diff --git a/.gitignore b/.gitignore index 4b7e9ce..8501c5d 100644 --- a/.gitignore +++ b/.gitignore @@ -15,4 +15,5 @@ config-*.json config.yaml config-*.yaml logs/* -local_archive/ \ No newline at end of file +local_archive/ +vk_config*.json \ No newline at end of file diff --git a/Pipfile b/Pipfile index fedfd51..1b55d86 100644 --- a/Pipfile +++ b/Pipfile @@ -22,6 +22,8 @@ google-auth-oauthlib = "*" oauth2client = "*" python-slugify = "*" pyyaml = "*" +vk-api = "*" +dateparser = "*" [requires] python_version = "3.9" diff --git a/Pipfile.lock b/Pipfile.lock index 9f1a12b..0b911f3 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -1,7 +1,7 @@ { "_meta": { "hash": { - "sha256": "602a05a8fa475181c24714ab57188a417fdfddf373a7dab4fa0ba0fcb7ce8d0a" + "sha256": "d06498403429a8fffcd6d049b314872c0095abee7fb9c6ffd3ba3d7b0c31c8cd" }, "pipfile-spec": 6, "requires": { @@ -50,19 +50,19 @@ }, "boto3": { "hashes": [ - "sha256:28ab0947c49a6fb2409004d4a10b2828aec231cb95ca1d800cb1411e191cc201", - "sha256:833e67edfb73f2cc22ff27a1c33728686dc90a9e81ba2551f9462ea2d1b04f41" + "sha256:0821212ff521cb934801b1f655cef3c0e976775324b1018f1751700d0f42dbb4", + "sha256:87d34861727699c795bf8d65703f2435e75f12879bdd483e08b35b7c5510e8c8" ], "index": "pypi", - "version": "==1.24.8" + "version": "==1.24.9" }, "botocore": { "hashes": [ - "sha256:ad92702930d6cb7b587fc2f619672feb74d5218f8de387a28c2905820db79027", - "sha256:db6667b8dfd175d16187653942cd91dd1f0cf36adc0ea9d7a0805ba4d2a3321f" + "sha256:5669b982b0583e73daef1fe0a4df311055e6287326f857dbb1dcc2de1d8412ad", + "sha256:7a7588b0170e571317496ac4104803329d5bc792bc008e8a757ffd440f1b6fa6" ], "markers": "python_version >= '3.7'", - "version": "==1.27.8" + "version": "==1.27.9" }, "brotli": { "hashes": [ @@ -152,7 +152,7 @@ "sha256:9c5705e395cd70084351dd8ad5c41e65655e08ce46f2ec9cf6c2c08390f71eb7", "sha256:f1d53542ee8cbedbe2118b5686372fb33c297fcd6379b050cca0ef13a597382a" ], - "markers": "python_version >= '3.6'", + "markers": "python_full_version >= '3.6.0'", 
"version": "==2022.5.18.1" }, "cffi": { @@ -267,6 +267,14 @@ ], "version": "==37.0.2" }, + "dateparser": { + "hashes": [ + "sha256:038196b1f12c7397e38aad3d61588833257f6f552baa63a1499e6987fa8d42d9", + "sha256:9600874312ff28a41f96ec7ccdc73be1d1c44435719da47fea3339d55ff5a628" + ], + "index": "pypi", + "version": "==1.1.1" + }, "ffmpeg-python": { "hashes": [ "sha256:65225db34627c578ef0e11c8b1eb528bb35e024752f6f10b78c011f6f64c4127", @@ -303,7 +311,7 @@ "sha256:958024c6aa3460b08f35741231076a4dd9a4c819a6a39d44da9627febe8b28f0", "sha256:ce1daa49644b50398093d2a9ad886501aa845e2602af70c3001b9f402a9d7359" ], - "markers": "python_version >= '3.6'", + "markers": "python_full_version >= '3.6.0'", "version": "==2.8.1" }, "google-api-python-client": { @@ -316,11 +324,11 @@ }, "google-auth": { "hashes": [ - "sha256:8a954960f852d5f19e6af14dd8e75c20159609e85d8db37e4013cc8c3824a7e1", - "sha256:df549a1433108801b11bdcc0e312eaf0d5f0500db42f0523e4d65c78722e8475" + "sha256:819b70140d05501739e1387291d39f0de3b4dff3b00ae4aff8e7a05369957f89", + "sha256:9b1da39ab8731c3061f36fefde9f8bb902dbee9eb28e3a67e8cfa7dc1be76227" ], "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5'", - "version": "==2.7.0" + "version": "==2.8.0" }, "google-auth-httplib2": { "hashes": [ @@ -343,7 +351,7 @@ "sha256:023eaea9d8c1cceccd9587c6af6c20f33eeeb05d4148670f2b0322dc1511700c", "sha256:b09b56f5463070c2153753ef123f07d2e49235e89148e9b2459ec8ed2f68d7d3" ], - "markers": "python_version >= '3.6'", + "markers": "python_full_version >= '3.6.0'", "version": "==1.56.2" }, "gspread": { @@ -359,7 +367,7 @@ "sha256:70813c1135087a248a4d38cc0e1a0181ffab2188141a93eaf567940c3957ff06", "sha256:8ddd78563b633ca55346c8cd41ec0af27d3c79931828beffb46ce70a379e7442" ], - "markers": "python_version >= '3.6'", + "markers": "python_full_version >= '3.6.0'", "version": "==0.13.0" }, "httplib2": { @@ -554,7 +562,7 @@ "sha256:23a8208d75b902797ea29fd31fa80a15ed9dc2c6c16fe73f5d346f83f6fa27a2", 
"sha256:6db33440354787f9b7f3a6dbd4febf5d0f93758354060e802f6c06cb493022fe" ], - "markers": "python_version >= '3.6'", + "markers": "python_full_version >= '3.6.0'", "version": "==3.2.0" }, "outcome": { @@ -682,7 +690,7 @@ "sha256:5eb116118f9612ff1ee89ac96437bb6b49e8f04d8a13b514ba26f620208e26eb", "sha256:dc9c10fb40944260f6ed4c688ece0cd2048414940f1cea51b8b226318411c519" ], - "markers": "python_version >= '3.6'", + "markers": "python_full_version >= '3.6.0'", "version": "==2.12.0" }, "pyopenssl": { @@ -724,6 +732,21 @@ "index": "pypi", "version": "==6.1.2" }, + "pytz": { + "hashes": [ + "sha256:1e760e2fe6a8163bc0b3d9a19c4f84342afa0a2affebfaa84b01b978a02ecaa7", + "sha256:e68985985296d9a66a881eb3193b0906246245294a881e7c8afe623866ac6a5c" + ], + "version": "==2022.1" + }, + "pytz-deprecation-shim": { + "hashes": [ + "sha256:8314c9692a636c8eb3bda879b9f119e350e93223ae83e70e80c31675a0fdc1a6", + "sha256:af097bae1b616dde5c5744441e2ddc69e74dfdcb0c263129610d85b87445a59d" + ], + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5'", + "version": "==0.1.0.post0" + }, "pyyaml": { "hashes": [ "sha256:0283c35a6a9fbf047493e3a0ce8d79ef5030852c51e9d911a27badfde0605293", @@ -763,7 +786,88 @@ "index": "pypi", "version": "==6.0" }, + "regex": { + "hashes": [ + "sha256:0008650041531d0eadecc96a73d37c2dc4821cf51b0766e374cb4f1ddc4e1c14", + "sha256:03299b0bcaa7824eb7c0ebd7ef1e3663302d1b533653bfe9dc7e595d453e2ae9", + "sha256:06b1df01cf2aef3a9790858af524ae2588762c8a90e784ba00d003f045306204", + "sha256:09b4b6ccc61d4119342b26246ddd5a04accdeebe36bdfe865ad87a0784efd77f", + "sha256:0be0c34a39e5d04a62fd5342f0886d0e57592a4f4993b3f9d257c1f688b19737", + "sha256:0d96eec8550fd2fd26f8e675f6d8b61b159482ad8ffa26991b894ed5ee19038b", + "sha256:0eb0e2845e81bdea92b8281a3969632686502565abf4a0b9e4ab1471c863d8f3", + "sha256:13bbf0c9453c6d16e5867bda7f6c0c7cff1decf96c5498318bb87f8136d2abd4", + "sha256:17e51ad1e6131c496b58d317bc9abec71f44eb1957d32629d06013a21bc99cac", + 
"sha256:1977bb64264815d3ef016625adc9df90e6d0e27e76260280c63eca993e3f455f", + "sha256:1e30762ddddb22f7f14c4f59c34d3addabc789216d813b0f3e2788d7bcf0cf29", + "sha256:1e73652057473ad3e6934944af090852a02590c349357b79182c1b681da2c772", + "sha256:20e6a27959f162f979165e496add0d7d56d7038237092d1aba20b46de79158f1", + "sha256:286ff9ec2709d56ae7517040be0d6c502642517ce9937ab6d89b1e7d0904f863", + "sha256:297c42ede2c81f0cb6f34ea60b5cf6dc965d97fa6936c11fc3286019231f0d66", + "sha256:320c2f4106962ecea0f33d8d31b985d3c185757c49c1fb735501515f963715ed", + "sha256:35ed2f3c918a00b109157428abfc4e8d1ffabc37c8f9abc5939ebd1e95dabc47", + "sha256:3d146e5591cb67c5e836229a04723a30af795ef9b70a0bbd913572e14b7b940f", + "sha256:42bb37e2b2d25d958c25903f6125a41aaaa1ed49ca62c103331f24b8a459142f", + "sha256:42d6007722d46bd2c95cce700181570b56edc0dcbadbfe7855ec26c3f2d7e008", + "sha256:43eba5c46208deedec833663201752e865feddc840433285fbadee07b84b464d", + "sha256:452519bc4c973e961b1620c815ea6dd8944a12d68e71002be5a7aff0a8361571", + "sha256:4b9c16a807b17b17c4fa3a1d8c242467237be67ba92ad24ff51425329e7ae3d0", + "sha256:5510932596a0f33399b7fff1bd61c59c977f2b8ee987b36539ba97eb3513584a", + "sha256:55820bc631684172b9b56a991d217ec7c2e580d956591dc2144985113980f5a3", + "sha256:57484d39447f94967e83e56db1b1108c68918c44ab519b8ecfc34b790ca52bf7", + "sha256:58ba41e462653eaf68fc4a84ec4d350b26a98d030be1ab24aba1adcc78ffe447", + "sha256:5bc5f921be39ccb65fdda741e04b2555917a4bced24b4df14eddc7569be3b493", + "sha256:5dcc4168536c8f68654f014a3db49b6b4a26b226f735708be2054314ed4964f4", + "sha256:5f92a7cdc6a0ae2abd184e8dfd6ef2279989d24c85d2c85d0423206284103ede", + "sha256:67250b36edfa714ba62dc62d3f238e86db1065fccb538278804790f578253640", + "sha256:6df070a986fc064d865c381aecf0aaff914178fdf6874da2f2387e82d93cc5bd", + "sha256:729aa8ca624c42f309397c5fc9e21db90bf7e2fdd872461aabdbada33de9063c", + "sha256:72bc3a5effa5974be6d965ed8301ac1e869bc18425c8a8fac179fbe7876e3aee", + "sha256:74d86e8924835f863c34e646392ef39039405f6ce52956d8af16497af4064a30", 
+ "sha256:79e5af1ff258bc0fe0bdd6f69bc4ae33935a898e3cbefbbccf22e88a27fa053b", + "sha256:7b103dffb9f6a47ed7ffdf352b78cfe058b1777617371226c1894e1be443afec", + "sha256:83f03f0bd88c12e63ca2d024adeee75234d69808b341e88343b0232329e1f1a1", + "sha256:86d7a68fa53688e1f612c3246044157117403c7ce19ebab7d02daf45bd63913e", + "sha256:878c626cbca3b649e14e972c14539a01191d79e58934e3f3ef4a9e17f90277f8", + "sha256:878f5d649ba1db9f52cc4ef491f7dba2d061cdc48dd444c54260eebc0b1729b9", + "sha256:87bc01226cd288f0bd9a4f9f07bf6827134dc97a96c22e2d28628e824c8de231", + "sha256:8babb2b5751105dc0aef2a2e539f4ba391e738c62038d8cb331c710f6b0f3da7", + "sha256:91e0f7e7be77250b808a5f46d90bf0032527d3c032b2131b63dee54753a4d729", + "sha256:9557545c10d52c845f270b665b52a6a972884725aa5cf12777374e18f2ea8960", + "sha256:9ccb0a4ab926016867260c24c192d9df9586e834f5db83dfa2c8fffb3a6e5056", + "sha256:9d828c5987d543d052b53c579a01a52d96b86f937b1777bbfe11ef2728929357", + "sha256:9efa41d1527b366c88f265a227b20bcec65bda879962e3fc8a2aee11e81266d7", + "sha256:aaf5317c961d93c1a200b9370fb1c6b6836cc7144fef3e5a951326912bf1f5a3", + "sha256:ab69b4fe09e296261377d209068d52402fb85ef89dc78a9ac4a29a895f4e24a7", + "sha256:ad397bc7d51d69cb07ef89e44243f971a04ce1dca9bf24c992c362406c0c6573", + "sha256:ae17fc8103f3b63345709d3e9654a274eee1c6072592aec32b026efd401931d0", + "sha256:af4d8cc28e4c7a2f6a9fed544228c567340f8258b6d7ea815b62a72817bbd178", + "sha256:b22ff939a8856a44f4822da38ef4868bd3a9ade22bb6d9062b36957c850e404f", + "sha256:b549d851f91a4efb3e65498bd4249b1447ab6035a9972f7fc215eb1f59328834", + "sha256:be319f4eb400ee567b722e9ea63d5b2bb31464e3cf1b016502e3ee2de4f86f5c", + "sha256:c0446b2871335d5a5e9fcf1462f954586b09a845832263db95059dcd01442015", + "sha256:c68d2c04f7701a418ec2e5631b7f3552efc32f6bcc1739369c6eeb1af55f62e0", + "sha256:c87ac58b9baaf50b6c1b81a18d20eda7e2883aa9a4fb4f1ca70f2e443bfcdc57", + "sha256:caa2734ada16a44ae57b229d45091f06e30a9a52ace76d7574546ab23008c635", + 
"sha256:cb34c2d66355fb70ae47b5595aafd7218e59bb9c00ad8cc3abd1406ca5874f07", + "sha256:cb3652bbe6720786b9137862205986f3ae54a09dec8499a995ed58292bdf77c2", + "sha256:cf668f26604e9f7aee9f8eaae4ca07a948168af90b96be97a4b7fa902a6d2ac1", + "sha256:d326ff80ed531bf2507cba93011c30fff2dd51454c85f55df0f59f2030b1687b", + "sha256:d6c2441538e4fadd4291c8420853431a229fcbefc1bf521810fbc2629d8ae8c2", + "sha256:d6ecfd1970b3380a569d7b3ecc5dd70dba295897418ed9e31ec3c16a5ab099a5", + "sha256:e5602a9b5074dcacc113bba4d2f011d2748f50e3201c8139ac5b68cf2a76bd8b", + "sha256:ef806f684f17dbd6263d72a54ad4073af42b42effa3eb42b877e750c24c76f86", + "sha256:f3356afbb301ec34a500b8ba8b47cba0b44ed4641c306e1dd981a08b416170b5", + "sha256:f6f7ee2289176cb1d2c59a24f50900f8b9580259fa9f1a739432242e7d254f93", + "sha256:f7e8f1ee28e0a05831c92dc1c0c1c94af5289963b7cf09eca5b5e3ce4f8c91b0", + "sha256:f8169ec628880bdbca67082a9196e2106060a4a5cbd486ac51881a4df805a36f", + "sha256:fbc88d3ba402b5d041d204ec2449c4078898f89c4a6e6f0ed1c1a510ef1e221d", + "sha256:fbd3fe37353c62fd0eb19fb76f78aa693716262bcd5f9c14bb9e5aca4b3f0dc4" + ], + "markers": "python_full_version >= '3.6.0'", + "version": "==2022.3.2" + }, "requests": { + "extras": [], "hashes": [ "sha256:bc7861137fbce630f17b03d3ad02ad0bf978c844f3536d0edda6499dafce2b6f", "sha256:d568723a7ebd25875d8d1eaf5dfa068cd2fc8194b2e483d7b1f7c81918dbec6b" @@ -799,7 +903,7 @@ "sha256:5c6bd9dc7a543b7fe4304a631f8a8a3b674e2bbfc49c2ae96200cdbe55df6b17", "sha256:95c5d300c4e879ee69708c428ba566c59478fd653cc3a22243eeb8ed846950bb" ], - "markers": "python_version >= '3.6' and python_version < '4'", + "markers": "python_version < '4' and python_full_version >= '3.6.0'", "version": "==4.8" }, "s3transfer": { @@ -853,7 +957,7 @@ "sha256:3b2503d3c7084a42b1ebd08116e5f81aadfaea95863628c80a3b774a11b7c759", "sha256:fc53893b3da2c33de295667a0e19f078c14bf86544af307354de5fcf12a3f30d" ], - "markers": "python_version >= '3.6'", + "markers": "python_full_version >= '3.6.0'", "version": "==2.3.2.post1" }, "telethon": { 
@@ -902,12 +1006,28 @@ "markers": "python_version >= '3.5'", "version": "==0.9.2" }, + "tzdata": { + "hashes": [ + "sha256:238e70234214138ed7b4e8a0fab0e5e13872edab3be586ab8198c407620e2ab9", + "sha256:8b536a8ec63dc0751342b3984193a3118f8fca2afe25752bb9b7fffd398552d3" + ], + "markers": "python_full_version >= '3.6.0'", + "version": "==2022.1" + }, + "tzlocal": { + "hashes": [ + "sha256:89885494684c929d9191c57aa27502afc87a579be5cdd3225c77c463ea043745", + "sha256:ee5842fa3a795f023514ac2d801c4a81d1743bbe642e3940143326b3a00addd7" + ], + "markers": "python_full_version >= '3.6.0'", + "version": "==4.2" + }, "uritemplate": { "hashes": [ "sha256:4346edfc5c3b79f694bccd6d6099a322bbeb628dbf2cd86eea55a456ce5124f0", "sha256:830c08b8d99bdd312ea4ead05994a38e8936266f84b9a7878232db50b044e02e" ], - "markers": "python_version >= '3.6'", + "markers": "python_full_version >= '3.6.0'", "version": "==4.1.1" }, "urllib3": { @@ -922,6 +1042,14 @@ "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' and python_version < '4'", "version": "==1.26.9" }, + "vk-api": { + "hashes": [ + "sha256:11c731e214ebc7fa911db81efb021f97587493a5402b992f24748fe1cd9d7afc", + "sha256:d0ae766fa93a40d47c5da045d94201721bf766dbde122a1d2253516b35c5edf3" + ], + "index": "pypi", + "version": "==11.9.8" + }, "websockets": { "hashes": [ "sha256:07cdc0a5b2549bcfbadb585ad8471ebdc7bdf91e32e34ae3889001c1c106a6af", diff --git a/README.md b/README.md index 39204a2..e60774c 100644 --- a/README.md +++ b/README.md @@ -140,7 +140,7 @@ With this configuration, the archiver should archive and store all media added t # auto_auto_archiver -To make it easier to set up new auto-archiver sheets, the auto-auto-archiver will look at a particular sheet and run the auto-archiver on every sheet name in column A, starting from row 11. (It starts here to support instructional text in the first rows of the sheet, as shown below.) This script takes one command line argument, with `--sheet`, the name of the sheet. 
It must be shared with the same service account. +To make it easier to set up new auto-archiver sheets, the auto-auto-archiver will look at a particular sheet and run the auto-archiver on every sheet name in column A, starting from row 11. (It starts here to support instructional text in the first rows of the sheet, as shown below.) You can simply use your default config as for `auto_archive.py` but use `--sheet` to specify the name of the sheet that lists the names of sheets to archive. It must be shared with the same service account. ![A screenshot of a Google Spreadsheet configured to show instructional text and a list of sheet names to check with auto-archiver.](docs/auto-auto.png) @@ -152,15 +152,16 @@ Code is split into functional concepts: 1. [GWorksheet](utils/gworksheet.py) - facilitates some of the reading/writing tasks for a Google Worksheet ### Current Archivers -Archivers are tested in a meaningful order with Wayback Machine being the default, that can easily be changed in the code. +Archivers are tested in a meaningful order, with Wayback Machine being the failsafe; the order can easily be changed in the code. 
```mermaid graph TD - A(Archiver) -->|parent of| B(YoutubeDLArchiver) - A -->|parent of| C(TikTokArchiver) - A -->|parent of| D(TwitterArchiver) + A(Archiver) -->|parent of| B(TelethonArchiver) + A -->|parent of| C(TiktokArchiver) + A -->|parent of| D(YoutubeDLArchiver) A -->|parent of| E(TelegramArchiver) - A -->|parent of| F(TelethonArchiver) - A -->|parent of| G(WaybackArchiver) + A -->|parent of| F(TwitterArchiver) + A -->|parent of| G(VkArchiver) + A -->|parent of| H(WaybackArchiver) ``` ### Current Storages ```mermaid diff --git a/archivers/__init__.py b/archivers/__init__.py index 40fbb4b..33700d1 100644 --- a/archivers/__init__.py +++ b/archivers/__init__.py @@ -5,4 +5,5 @@ from .telethon_archiver import TelethonArchiver from .tiktok_archiver import TiktokArchiver from .wayback_archiver import WaybackArchiver from .youtubedl_archiver import YoutubeDLArchiver -from .twitter_archiver import TwitterArchiver \ No newline at end of file +from .twitter_archiver import TwitterArchiver +from .vk_archiver import VkArchiver \ No newline at end of file diff --git a/archivers/base_archiver.py b/archivers/base_archiver.py index 18e4c1b..a8b0413 100644 --- a/archivers/base_archiver.py +++ b/archivers/base_archiver.py @@ -1,4 +1,4 @@ -import os, datetime, shutil, hashlib, time, requests, re +import os, datetime, shutil, hashlib, time, requests, re, mimetypes from dataclasses import dataclass from abc import ABC, abstractmethod from urllib.parse import urlparse @@ -58,7 +58,13 @@ class Archiver(ABC):

{url}

{self.name} object data:

{object}" page += f"" @@ -77,7 +83,18 @@ class Archiver(ABC): page_cdn = self.storage.get_cdn_url(page_key) return (page_cdn, page_hash, thumbnail) + def _guess_file_type(self, path: str): + """ + Receives a URL or filename and returns global mimetype like 'image' or 'video' + see https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/MIME_types/Common_types + """ + mime = mimetypes.guess_type(path)[0] + if mime is not None: + return mime.split("/")[0] + return "" + # eg images in a tweet save to cloud storage + def generate_media_page(self, urls, url, object): """ For a list of media urls, fetch them, upload them @@ -208,12 +225,11 @@ class Archiver(ABC): key = key_folder + fname self.storage.upload(thumbnail_filename, key) - cdn_url = self.storage.get_cdn_url(key) cdn_urls.append(cdn_url) if len(cdn_urls) == 0: - return ('None', 'None') + return ('', '') key_thumb = cdn_urls[int(len(cdn_urls) * 0.1)] diff --git a/archivers/telegram_archiver.py b/archivers/telegram_archiver.py index 22de30e..0b6e777 100644 --- a/archivers/telegram_archiver.py +++ b/archivers/telegram_archiver.py @@ -53,7 +53,6 @@ class TelegramArchiver(Archiver): key = self.get_key(video_id) filename = os.path.join(Storage.TMP_FOLDER, key) - cdn_url = self.storage.get_cdn_url(key) if check_if_exists and self.storage.exists(key): status = 'already archived' @@ -84,5 +83,6 @@ class TelegramArchiver(Archiver): filename, key, duration=duration) os.remove(filename) + cdn_url = self.storage.get_cdn_url(key) return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb, thumbnail_index=thumb_index, duration=duration, title=original_url, timestamp=s.find_all('time')[1].get('datetime'), hash=hash, screenshot=screenshot) diff --git a/archivers/telethon_archiver.py b/archivers/telethon_archiver.py index 18996d8..2f4de02 100644 --- a/archivers/telethon_archiver.py +++ b/archivers/telethon_archiver.py @@ -8,6 +8,7 @@ from telethon.errors import ChannelInvalidError from storages import 
Storage from .base_archiver import Archiver, ArchiveResult from configs import TelethonConfig +from utils import getattr_or class TelethonArchiver(Archiver): @@ -16,8 +17,9 @@ class TelethonArchiver(Archiver): def __init__(self, storage: Storage, driver, config: TelethonConfig): super().__init__(storage, driver) - self.client = TelegramClient("./anon", config.api_id, config.api_hash) - self.bot_token = config.bot_token + if config: + self.client = TelegramClient("./anon", config.api_id, config.api_hash) + self.bot_token = config.bot_token def _get_media_posts_in_group(self, chat, original_post, max_amp=10): """ @@ -26,8 +28,8 @@ class TelethonArchiver(Archiver): of `max_amp` both ways Returns a list of [post] where each post has media and is in the same grouped_id """ - if original_post.grouped_id is None: - return [original_post] if original_post.media is not None else [] + if getattr_or(original_post, "grouped_id") is None: + return [original_post] if getattr_or(original_post, "media") else [] search_ids = [i for i in range(original_post.id - max_amp, original_post.id + max_amp + 1)] posts = self.client.get_messages(chat, ids=search_ids) @@ -38,6 +40,10 @@ class TelethonArchiver(Archiver): return media def download(self, url, check_if_exists=False): + if not hasattr(self, "client"): + logger.error('Missing Telethon config') + return False + # detect URLs that we definitely cannot handle matches = self.link_pattern.findall(url) if not len(matches): @@ -61,12 +67,14 @@ class TelethonArchiver(Archiver): logger.error(f"Could not fetch telegram {url}. 
This error can be fixed if you setup a bot_token in addition to api_id and api_hash: {e}") return False + if post is None: return False + media_posts = self._get_media_posts_in_group(chat, post) logger.debug(f'got {len(media_posts)=} for {url=}') screenshot = self.get_screenshot(url) - if len(media_posts) > 1: + if len(media_posts) > 0: key = self.get_html_key(url) if check_if_exists and self.storage.exists(key): @@ -78,7 +86,7 @@ class TelethonArchiver(Archiver): group_id = post.grouped_id if post.grouped_id is not None else post.id uploaded_media = [] message = post.message - for mp in media_posts: + for i, mp in enumerate(media_posts): if len(mp.message) > len(message): message = mp.message filename_dest = os.path.join(Storage.TMP_FOLDER, f'{chat}_{group_id}', str(mp.id)) filename = self.client.download_media(mp.media, filename_dest) @@ -87,22 +95,13 @@ class TelethonArchiver(Archiver): hash = self.get_hash(filename) cdn_url = self.storage.get_cdn_url(key) uploaded_media.append({'cdn_url': cdn_url, 'key': key, 'hash': hash}) + if i == 0: + key_thumb, thumb_index = self.get_thumbnails(filename, key) os.remove(filename) page_cdn, page_hash, _ = self.generate_media_page_html(url, uploaded_media, html.escape(str(post))) - return ArchiveResult(status=status, cdn_url=page_cdn, title=message, timestamp=post.date, hash=page_hash, screenshot=screenshot) - elif len(media_posts) == 1: - key = self.get_key(f'{chat}_{post_id}') - filename = self.client.download_media(post.media, os.path.join(Storage.TMP_FOLDER, key)) - key = filename.split(Storage.TMP_FOLDER)[1].replace(" ", "") - self.storage.upload(filename, key) - hash = self.get_hash(filename) - cdn_url = self.storage.get_cdn_url(key) - key_thumb, thumb_index = self.get_thumbnails(filename, key) - os.remove(filename) - - return ArchiveResult(status=status, cdn_url=cdn_url, title=post.message, thumbnail=key_thumb, thumbnail_index=thumb_index, timestamp=post.date, hash=hash, screenshot=screenshot) + return 
ArchiveResult(status=status, cdn_url=page_cdn, title=message, timestamp=post.date, hash=page_hash, screenshot=screenshot, thumbnail=key_thumb, thumbnail_index=thumb_index) page_cdn, page_hash, _ = self.generate_media_page_html(url, [], html.escape(str(post))) - return ArchiveResult(status=status, cdn_url=page_cdn, title=post.message, timestamp=post.date, hash=page_hash, screenshot=screenshot) + return ArchiveResult(status=status, cdn_url=page_cdn, title=post.message, timestamp=getattr_or(post, "date"), hash=page_hash, screenshot=screenshot) diff --git a/archivers/vk_archiver.py b/archivers/vk_archiver.py new file mode 100644 index 0000000..e48e9ef --- /dev/null +++ b/archivers/vk_archiver.py @@ -0,0 +1,89 @@ +import re, json, requests + +import vk_api, dateparser +from bs4 import BeautifulSoup +from loguru import logger + +from storages import Storage +from .base_archiver import Archiver, ArchiveResult +from configs import VkConfig + + +class VkArchiver(Archiver): + """" + VK videos are handled by YTDownloader, this archiver gets posts text and images. 
+ Currently only works for /wall posts + """ + name = "vk" + wall_pattern = re.compile(r"(wall.{0,1}\d+_\d+)") + photo_pattern = re.compile(r"(photo.{0,1}\d+_\d+)") + onclick_pattern = re.compile(r"({.*})") + + def __init__(self, storage: Storage, driver, config: VkConfig): + super().__init__(storage, driver) + if config != None: + self.vk_session = vk_api.VkApi(config.username, config.password) + self.vk_session.auth(token_only=True) + + def download(self, url, check_if_exists=False): + # detect URLs that this archiver can handle + _id, method = None, None + if has_wall := self.wall_pattern.search(url): + _id = has_wall[0] + method = self.archive_wall + elif has_photo := self.photo_pattern.search(url): + _id = has_photo[0] + method = self.archive_photo + else: return False + + logger.info(f"found valid {_id=} from {url=}") + proper_url = f'https://vk.com/{_id}' + + # if check if exists will not download again + key = self.get_html_key(proper_url) + if check_if_exists and self.storage.exists(key): + screenshot = self.get_screenshot(proper_url) + cdn_url = self.storage.get_cdn_url(key) + return ArchiveResult(status="already archived", cdn_url=cdn_url, screenshot=screenshot) + + try: + return method(proper_url, _id) + except Exception as e: + logger.error(f"something went wrong with vk archive, possibly 404 causing index out of range, or missing key: {e}") + return False + + def archive_photo(self, photo_url, photo_id): + headers = {"access_token": self.vk_session.token["access_token"], "photos": photo_id.replace("photo", ""), "extended": "1", "v": self.vk_session.api_version} + req = requests.get("https://api.vk.com/method/photos.getById", headers) + res = req.json()["response"][0] + title = res["text"][:200] # more on the page + img_url = res["orig_photo"]["url"] + time = dateparser.parse(str(res["date"]), settings={"RETURN_AS_TIMEZONE_AWARE": True, "TO_TIMEZONE": "UTC"}) + + page_cdn, page_hash, thumbnail = self.generate_media_page([img_url], photo_url, res) + 
screenshot = self.get_screenshot(photo_url) + return ArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=time, title=title) + + def archive_wall(self, wall_url, wall_id): + headers = {"access_token": self.vk_session.token["access_token"], "posts": wall_id.replace("wall", ""), "extended": "1", "copy_history_depth": "2", "v": self.vk_session.api_version} + req = requests.get("https://api.vk.com/method/wall.getById", headers) + res = req.json()["response"] + wall = res["items"][0] + img_urls = [] + if "attachments" in wall: + for a in wall["attachments"]: + attachment = a[a["type"]] + if "thumb" in attachment: + attachment = attachment["thumb"] + if "sizes" in attachment: + try: img_urls.append(attachment["sizes"][-1]["url"]) + except Exception as e: + logger.warning(f"could not get image from attachment: {e}") + + + title = wall["text"][:200] # more on the page + time = dateparser.parse(str(wall["date"]), settings={"RETURN_AS_TIMEZONE_AWARE": True, "TO_TIMEZONE": "UTC"}) + + page_cdn, page_hash, thumbnail = self.generate_media_page(img_urls, wall_url, res) + screenshot = self.get_screenshot(wall_url) + return ArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=time, title=title) diff --git a/archivers/wayback_archiver.py b/archivers/wayback_archiver.py index 81c1644..f46d1cb 100644 --- a/archivers/wayback_archiver.py +++ b/archivers/wayback_archiver.py @@ -18,10 +18,12 @@ class WaybackArchiver(Archiver): def __init__(self, storage: Storage, driver, config: WaybackConfig): super(WaybackArchiver, self).__init__(storage, driver) self.config = config - # TODO: this logic should live at the auto-archiver level self.seen_urls = {} def download(self, url, check_if_exists=False): + if self.config is None: + logger.error('Missing Wayback config') + return False if check_if_exists: if url in self.seen_urls: return self.seen_urls[url] @@ -57,7 
+59,7 @@ class WaybackArchiver(Archiver): retries += 1 if status_r.status_code != 200: - return ArchiveResult(status="Internet archive failed", screenshot=screenshot) + return ArchiveResult(status=f"Internet archive failed: check https://web.archive.org/save/status/{job_id}", screenshot=screenshot) status_json = status_r.json() if status_json['status'] != 'success': diff --git a/archivers/youtubedl_archiver.py b/archivers/youtubedl_archiver.py index be3477d..7990131 100644 --- a/archivers/youtubedl_archiver.py +++ b/archivers/youtubedl_archiver.py @@ -106,11 +106,11 @@ class YoutubeDLArchiver(Archiver): os.remove(filename) - timestamp = datetime.datetime.utcfromtimestamp(info['timestamp']).replace(tzinfo=datetime.timezone.utc).isoformat() \ - if 'timestamp' in info else \ - datetime.datetime.strptime(info['upload_date'], '%Y%m%d').replace(tzinfo=datetime.timezone.utc) \ - if 'upload_date' in info and info['upload_date'] is not None else \ - None + timestamp = None + if 'timestamp' in info and info['timestamp'] is not None: + timestamp = datetime.datetime.utcfromtimestamp(info['timestamp']).replace(tzinfo=datetime.timezone.utc).isoformat() + elif 'upload_date' in info and info['upload_date'] is not None: + timestamp = datetime.datetime.strptime(info['upload_date'], '%Y%m%d').replace(tzinfo=datetime.timezone.utc) return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb, thumbnail_index=thumb_index, duration=duration, title=info['title'] if 'title' in info else None, timestamp=timestamp, hash=hash, screenshot=screenshot) diff --git a/auto_archive.py b/auto_archive.py index 8c5643a..2b8a9e9 100644 --- a/auto_archive.py +++ b/auto_archive.py @@ -3,7 +3,7 @@ import os, datetime, shutil, traceback, random from loguru import logger from slugify import slugify -from archivers import TelethonArchiver, TelegramArchiver, TiktokArchiver, YoutubeDLArchiver, TwitterArchiver, WaybackArchiver, ArchiveResult, Archiver +from archivers import TelethonArchiver, 
TelegramArchiver, TiktokArchiver, YoutubeDLArchiver, TwitterArchiver, VkArchiver, WaybackArchiver, ArchiveResult, Archiver from utils import GWorksheet, mkdir_if_not_exists, expand_url from configs import Config from storages import Storage @@ -95,6 +95,7 @@ def process_sheet(c: Config): YoutubeDLArchiver(storage, c.webdriver, c.facebook_cookie), TelegramArchiver(storage, c.webdriver), TwitterArchiver(storage, c.webdriver), + VkArchiver(storage, c.webdriver, c.vk_config), WaybackArchiver(storage, c.webdriver, c.wayback_config) ] diff --git a/auto_auto_archive.py b/auto_auto_archive.py index a518204..14bb751 100644 --- a/auto_auto_archive.py +++ b/auto_auto_archive.py @@ -1,29 +1,30 @@ -import gspread -import argparse +import shutil import auto_archive from loguru import logger +from configs import Config +from storages import Storage +from utils import mkdir_if_not_exists + def main(): - parser = argparse.ArgumentParser( - description="Automatically use youtube-dl to download media from a Google Sheet") - parser.add_argument("--sheet", action="store", dest="sheet") + c = Config() + c.parse() + logger.info(f'Opening document {c.sheet} to look for sheet names to archive') - args = parser.parse_args() - - logger.info("Opening document " + args.sheet) - - gc = gspread.service_account(filename='service_account.json') - sh = gc.open(args.sheet) + gc = c.gsheets_client + sh = gc.open(c.sheet) wks = sh.get_worksheet(0) values = wks.get_all_values() + mkdir_if_not_exists(Storage.TMP_FOLDER) for i in range(11, len(values)): - sheet_name = values[i][0] + c.sheet = values[i][0] + logger.info(f"Processing {c.sheet}") + auto_archive.process_sheet(c) + c.destroy_webdriver() + shutil.rmtree(Storage.TMP_FOLDER) - logger.info("Processing " + sheet_name) - - auto_archive.process_sheet(sheet_name) if __name__ == "__main__": main() diff --git a/configs/__init__.py b/configs/__init__.py index 5a693ca..f70c9c6 100644 --- a/configs/__init__.py +++ b/configs/__init__.py @@ -1,4 +1,5 @@ 
from .config import Config from .selenium_config import SeleniumConfig from .telethon_config import TelethonConfig -from .wayback_config import WaybackConfig \ No newline at end of file +from .wayback_config import WaybackConfig +from .vk_config import VkConfig \ No newline at end of file diff --git a/configs/config.py b/configs/config.py index dfe786c..78a5090 100644 --- a/configs/config.py +++ b/configs/config.py @@ -4,11 +4,13 @@ import gspread from loguru import logger from selenium import webdriver from dataclasses import asdict +from selenium.common.exceptions import TimeoutException from utils import GWorksheet, getattr_or from .wayback_config import WaybackConfig from .telethon_config import TelethonConfig from .selenium_config import SeleniumConfig +from .vk_config import VkConfig from storages import Storage, S3Config, S3Storage, GDStorage, GDConfig, LocalStorage, LocalConfig @@ -120,6 +122,7 @@ class Config: secret=secrets["wayback"]["secret"], ) else: + self.wayback_config = None logger.debug(f"'wayback' key not present in the {self.config_file=}") # telethon config @@ -130,8 +133,19 @@ class Config: bot_token=secrets["telegram"].get("bot_token", None) ) else: + self.telegram_config = None logger.debug(f"'telegram' key not present in the {self.config_file=}") + # vk config + if "vk" in secrets: + self.vk_config = VkConfig( + username=secrets["vk"]["username"], + password=secrets["vk"]["password"] + ) + else: + self.vk_config = None + logger.debug(f"'vk' key not present in the {self.config_file=}") + del self.config["secrets"] # delete to prevent leaks def set_log_files(self): @@ -197,16 +211,23 @@ class Config: def destroy_webdriver(self): if self.webdriver is not None and type(self.webdriver) != str: self.webdriver.quit() + del self.webdriver def recreate_webdriver(self): - self.destroy_webdriver() options = webdriver.FirefoxOptions() options.headless = True options.set_preference('network.protocol-handler.external.tg', False) - self.webdriver = 
webdriver.Firefox(options=options) - self.webdriver.set_window_size(self.selenium_config.window_width, + try: + new_webdriver = webdriver.Firefox(options=options) + # only destroy if creation is successful + self.destroy_webdriver() + self.webdriver = new_webdriver + self.webdriver.set_window_size(self.selenium_config.window_width, self.selenium_config.window_height) - self.webdriver.set_page_load_timeout(self.selenium_config.timeout_seconds) + self.webdriver.set_page_load_timeout(self.selenium_config.timeout_seconds) + except TimeoutException as e: + logger.error(f"failed to get new webdriver, possibly due to insufficient system resources or timeout settings: {e}") + def __str__(self) -> str: return json.dumps({ @@ -225,6 +246,7 @@ class Config: "local_config": hasattr(self, "local_config"), "wayback_config": self.wayback_config != None, "telegram_config": self.telegram_config != None, + "vk_config": self.vk_config != None, "gsheets_client": self.gsheets_client != None, "column_names": self.column_names, }, ensure_ascii=False, indent=4) diff --git a/configs/vk_config.py b/configs/vk_config.py new file mode 100644 index 0000000..db2e61c --- /dev/null +++ b/configs/vk_config.py @@ -0,0 +1,8 @@ + +from dataclasses import dataclass + + +@dataclass +class VkConfig: + username: str + password: str diff --git a/example.config.yaml b/example.config.yaml index 0c568c2..9026ad4 100644 --- a/example.config.yaml +++ b/example.config.yaml @@ -39,6 +39,11 @@ secrets: # optional, but allows access to more content such as large videos, talk to @botfather bot_token: your bot-token + # vkontakte (vk.com) credentials + vk: + username: "phone number or email" + password: "password" + google_sheets: # local filename: defaults to service_account.json, see https://gspread.readthedocs.io/en/latest/oauth2.html#for-bots-using-service-account service_account: "service_account.json"