From 59afe7fd6305a036b1fad5706e5d22b929786579 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Wed, 15 Jun 2022 16:38:18 +0200 Subject: [PATCH 01/23] vk-archiver implemented --- .gitignore | 3 +- Pipfile | 2 + Pipfile.lock | 166 ++++++++++++++++++++++++++++++++++----- archivers/__init__.py | 3 +- archivers/vk_archiver.py | 72 +++++++++++++++++ auto_archive.py | 3 +- configs/__init__.py | 3 +- configs/config.py | 14 ++++ configs/vk_config.py | 8 ++ 9 files changed, 251 insertions(+), 23 deletions(-) create mode 100644 archivers/vk_archiver.py create mode 100644 configs/vk_config.py diff --git a/.gitignore b/.gitignore index 4b7e9ce..8501c5d 100644 --- a/.gitignore +++ b/.gitignore @@ -15,4 +15,5 @@ config-*.json config.yaml config-*.yaml logs/* -local_archive/ \ No newline at end of file +local_archive/ +vk_config*.json \ No newline at end of file diff --git a/Pipfile b/Pipfile index fedfd51..1b55d86 100644 --- a/Pipfile +++ b/Pipfile @@ -22,6 +22,8 @@ google-auth-oauthlib = "*" oauth2client = "*" python-slugify = "*" pyyaml = "*" +vk-api = "*" +dateparser = "*" [requires] python_version = "3.9" diff --git a/Pipfile.lock b/Pipfile.lock index 9f1a12b..0b911f3 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -1,7 +1,7 @@ { "_meta": { "hash": { - "sha256": "602a05a8fa475181c24714ab57188a417fdfddf373a7dab4fa0ba0fcb7ce8d0a" + "sha256": "d06498403429a8fffcd6d049b314872c0095abee7fb9c6ffd3ba3d7b0c31c8cd" }, "pipfile-spec": 6, "requires": { @@ -50,19 +50,19 @@ }, "boto3": { "hashes": [ - "sha256:28ab0947c49a6fb2409004d4a10b2828aec231cb95ca1d800cb1411e191cc201", - "sha256:833e67edfb73f2cc22ff27a1c33728686dc90a9e81ba2551f9462ea2d1b04f41" + "sha256:0821212ff521cb934801b1f655cef3c0e976775324b1018f1751700d0f42dbb4", + "sha256:87d34861727699c795bf8d65703f2435e75f12879bdd483e08b35b7c5510e8c8" ], "index": "pypi", - "version": "==1.24.8" + "version": "==1.24.9" }, "botocore": { "hashes": [ - 
"sha256:ad92702930d6cb7b587fc2f619672feb74d5218f8de387a28c2905820db79027", - "sha256:db6667b8dfd175d16187653942cd91dd1f0cf36adc0ea9d7a0805ba4d2a3321f" + "sha256:5669b982b0583e73daef1fe0a4df311055e6287326f857dbb1dcc2de1d8412ad", + "sha256:7a7588b0170e571317496ac4104803329d5bc792bc008e8a757ffd440f1b6fa6" ], "markers": "python_version >= '3.7'", - "version": "==1.27.8" + "version": "==1.27.9" }, "brotli": { "hashes": [ @@ -152,7 +152,7 @@ "sha256:9c5705e395cd70084351dd8ad5c41e65655e08ce46f2ec9cf6c2c08390f71eb7", "sha256:f1d53542ee8cbedbe2118b5686372fb33c297fcd6379b050cca0ef13a597382a" ], - "markers": "python_version >= '3.6'", + "markers": "python_full_version >= '3.6.0'", "version": "==2022.5.18.1" }, "cffi": { @@ -267,6 +267,14 @@ ], "version": "==37.0.2" }, + "dateparser": { + "hashes": [ + "sha256:038196b1f12c7397e38aad3d61588833257f6f552baa63a1499e6987fa8d42d9", + "sha256:9600874312ff28a41f96ec7ccdc73be1d1c44435719da47fea3339d55ff5a628" + ], + "index": "pypi", + "version": "==1.1.1" + }, "ffmpeg-python": { "hashes": [ "sha256:65225db34627c578ef0e11c8b1eb528bb35e024752f6f10b78c011f6f64c4127", @@ -303,7 +311,7 @@ "sha256:958024c6aa3460b08f35741231076a4dd9a4c819a6a39d44da9627febe8b28f0", "sha256:ce1daa49644b50398093d2a9ad886501aa845e2602af70c3001b9f402a9d7359" ], - "markers": "python_version >= '3.6'", + "markers": "python_full_version >= '3.6.0'", "version": "==2.8.1" }, "google-api-python-client": { @@ -316,11 +324,11 @@ }, "google-auth": { "hashes": [ - "sha256:8a954960f852d5f19e6af14dd8e75c20159609e85d8db37e4013cc8c3824a7e1", - "sha256:df549a1433108801b11bdcc0e312eaf0d5f0500db42f0523e4d65c78722e8475" + "sha256:819b70140d05501739e1387291d39f0de3b4dff3b00ae4aff8e7a05369957f89", + "sha256:9b1da39ab8731c3061f36fefde9f8bb902dbee9eb28e3a67e8cfa7dc1be76227" ], "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5'", - "version": "==2.7.0" + "version": "==2.8.0" }, "google-auth-httplib2": { "hashes": [ @@ -343,7 +351,7 @@ 
"sha256:023eaea9d8c1cceccd9587c6af6c20f33eeeb05d4148670f2b0322dc1511700c", "sha256:b09b56f5463070c2153753ef123f07d2e49235e89148e9b2459ec8ed2f68d7d3" ], - "markers": "python_version >= '3.6'", + "markers": "python_full_version >= '3.6.0'", "version": "==1.56.2" }, "gspread": { @@ -359,7 +367,7 @@ "sha256:70813c1135087a248a4d38cc0e1a0181ffab2188141a93eaf567940c3957ff06", "sha256:8ddd78563b633ca55346c8cd41ec0af27d3c79931828beffb46ce70a379e7442" ], - "markers": "python_version >= '3.6'", + "markers": "python_full_version >= '3.6.0'", "version": "==0.13.0" }, "httplib2": { @@ -554,7 +562,7 @@ "sha256:23a8208d75b902797ea29fd31fa80a15ed9dc2c6c16fe73f5d346f83f6fa27a2", "sha256:6db33440354787f9b7f3a6dbd4febf5d0f93758354060e802f6c06cb493022fe" ], - "markers": "python_version >= '3.6'", + "markers": "python_full_version >= '3.6.0'", "version": "==3.2.0" }, "outcome": { @@ -682,7 +690,7 @@ "sha256:5eb116118f9612ff1ee89ac96437bb6b49e8f04d8a13b514ba26f620208e26eb", "sha256:dc9c10fb40944260f6ed4c688ece0cd2048414940f1cea51b8b226318411c519" ], - "markers": "python_version >= '3.6'", + "markers": "python_full_version >= '3.6.0'", "version": "==2.12.0" }, "pyopenssl": { @@ -724,6 +732,21 @@ "index": "pypi", "version": "==6.1.2" }, + "pytz": { + "hashes": [ + "sha256:1e760e2fe6a8163bc0b3d9a19c4f84342afa0a2affebfaa84b01b978a02ecaa7", + "sha256:e68985985296d9a66a881eb3193b0906246245294a881e7c8afe623866ac6a5c" + ], + "version": "==2022.1" + }, + "pytz-deprecation-shim": { + "hashes": [ + "sha256:8314c9692a636c8eb3bda879b9f119e350e93223ae83e70e80c31675a0fdc1a6", + "sha256:af097bae1b616dde5c5744441e2ddc69e74dfdcb0c263129610d85b87445a59d" + ], + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5'", + "version": "==0.1.0.post0" + }, "pyyaml": { "hashes": [ "sha256:0283c35a6a9fbf047493e3a0ce8d79ef5030852c51e9d911a27badfde0605293", @@ -763,7 +786,88 @@ "index": "pypi", "version": "==6.0" }, + "regex": { + "hashes": [ + 
"sha256:0008650041531d0eadecc96a73d37c2dc4821cf51b0766e374cb4f1ddc4e1c14", + "sha256:03299b0bcaa7824eb7c0ebd7ef1e3663302d1b533653bfe9dc7e595d453e2ae9", + "sha256:06b1df01cf2aef3a9790858af524ae2588762c8a90e784ba00d003f045306204", + "sha256:09b4b6ccc61d4119342b26246ddd5a04accdeebe36bdfe865ad87a0784efd77f", + "sha256:0be0c34a39e5d04a62fd5342f0886d0e57592a4f4993b3f9d257c1f688b19737", + "sha256:0d96eec8550fd2fd26f8e675f6d8b61b159482ad8ffa26991b894ed5ee19038b", + "sha256:0eb0e2845e81bdea92b8281a3969632686502565abf4a0b9e4ab1471c863d8f3", + "sha256:13bbf0c9453c6d16e5867bda7f6c0c7cff1decf96c5498318bb87f8136d2abd4", + "sha256:17e51ad1e6131c496b58d317bc9abec71f44eb1957d32629d06013a21bc99cac", + "sha256:1977bb64264815d3ef016625adc9df90e6d0e27e76260280c63eca993e3f455f", + "sha256:1e30762ddddb22f7f14c4f59c34d3addabc789216d813b0f3e2788d7bcf0cf29", + "sha256:1e73652057473ad3e6934944af090852a02590c349357b79182c1b681da2c772", + "sha256:20e6a27959f162f979165e496add0d7d56d7038237092d1aba20b46de79158f1", + "sha256:286ff9ec2709d56ae7517040be0d6c502642517ce9937ab6d89b1e7d0904f863", + "sha256:297c42ede2c81f0cb6f34ea60b5cf6dc965d97fa6936c11fc3286019231f0d66", + "sha256:320c2f4106962ecea0f33d8d31b985d3c185757c49c1fb735501515f963715ed", + "sha256:35ed2f3c918a00b109157428abfc4e8d1ffabc37c8f9abc5939ebd1e95dabc47", + "sha256:3d146e5591cb67c5e836229a04723a30af795ef9b70a0bbd913572e14b7b940f", + "sha256:42bb37e2b2d25d958c25903f6125a41aaaa1ed49ca62c103331f24b8a459142f", + "sha256:42d6007722d46bd2c95cce700181570b56edc0dcbadbfe7855ec26c3f2d7e008", + "sha256:43eba5c46208deedec833663201752e865feddc840433285fbadee07b84b464d", + "sha256:452519bc4c973e961b1620c815ea6dd8944a12d68e71002be5a7aff0a8361571", + "sha256:4b9c16a807b17b17c4fa3a1d8c242467237be67ba92ad24ff51425329e7ae3d0", + "sha256:5510932596a0f33399b7fff1bd61c59c977f2b8ee987b36539ba97eb3513584a", + "sha256:55820bc631684172b9b56a991d217ec7c2e580d956591dc2144985113980f5a3", + "sha256:57484d39447f94967e83e56db1b1108c68918c44ab519b8ecfc34b790ca52bf7", 
+ "sha256:58ba41e462653eaf68fc4a84ec4d350b26a98d030be1ab24aba1adcc78ffe447", + "sha256:5bc5f921be39ccb65fdda741e04b2555917a4bced24b4df14eddc7569be3b493", + "sha256:5dcc4168536c8f68654f014a3db49b6b4a26b226f735708be2054314ed4964f4", + "sha256:5f92a7cdc6a0ae2abd184e8dfd6ef2279989d24c85d2c85d0423206284103ede", + "sha256:67250b36edfa714ba62dc62d3f238e86db1065fccb538278804790f578253640", + "sha256:6df070a986fc064d865c381aecf0aaff914178fdf6874da2f2387e82d93cc5bd", + "sha256:729aa8ca624c42f309397c5fc9e21db90bf7e2fdd872461aabdbada33de9063c", + "sha256:72bc3a5effa5974be6d965ed8301ac1e869bc18425c8a8fac179fbe7876e3aee", + "sha256:74d86e8924835f863c34e646392ef39039405f6ce52956d8af16497af4064a30", + "sha256:79e5af1ff258bc0fe0bdd6f69bc4ae33935a898e3cbefbbccf22e88a27fa053b", + "sha256:7b103dffb9f6a47ed7ffdf352b78cfe058b1777617371226c1894e1be443afec", + "sha256:83f03f0bd88c12e63ca2d024adeee75234d69808b341e88343b0232329e1f1a1", + "sha256:86d7a68fa53688e1f612c3246044157117403c7ce19ebab7d02daf45bd63913e", + "sha256:878c626cbca3b649e14e972c14539a01191d79e58934e3f3ef4a9e17f90277f8", + "sha256:878f5d649ba1db9f52cc4ef491f7dba2d061cdc48dd444c54260eebc0b1729b9", + "sha256:87bc01226cd288f0bd9a4f9f07bf6827134dc97a96c22e2d28628e824c8de231", + "sha256:8babb2b5751105dc0aef2a2e539f4ba391e738c62038d8cb331c710f6b0f3da7", + "sha256:91e0f7e7be77250b808a5f46d90bf0032527d3c032b2131b63dee54753a4d729", + "sha256:9557545c10d52c845f270b665b52a6a972884725aa5cf12777374e18f2ea8960", + "sha256:9ccb0a4ab926016867260c24c192d9df9586e834f5db83dfa2c8fffb3a6e5056", + "sha256:9d828c5987d543d052b53c579a01a52d96b86f937b1777bbfe11ef2728929357", + "sha256:9efa41d1527b366c88f265a227b20bcec65bda879962e3fc8a2aee11e81266d7", + "sha256:aaf5317c961d93c1a200b9370fb1c6b6836cc7144fef3e5a951326912bf1f5a3", + "sha256:ab69b4fe09e296261377d209068d52402fb85ef89dc78a9ac4a29a895f4e24a7", + "sha256:ad397bc7d51d69cb07ef89e44243f971a04ce1dca9bf24c992c362406c0c6573", + 
"sha256:ae17fc8103f3b63345709d3e9654a274eee1c6072592aec32b026efd401931d0", + "sha256:af4d8cc28e4c7a2f6a9fed544228c567340f8258b6d7ea815b62a72817bbd178", + "sha256:b22ff939a8856a44f4822da38ef4868bd3a9ade22bb6d9062b36957c850e404f", + "sha256:b549d851f91a4efb3e65498bd4249b1447ab6035a9972f7fc215eb1f59328834", + "sha256:be319f4eb400ee567b722e9ea63d5b2bb31464e3cf1b016502e3ee2de4f86f5c", + "sha256:c0446b2871335d5a5e9fcf1462f954586b09a845832263db95059dcd01442015", + "sha256:c68d2c04f7701a418ec2e5631b7f3552efc32f6bcc1739369c6eeb1af55f62e0", + "sha256:c87ac58b9baaf50b6c1b81a18d20eda7e2883aa9a4fb4f1ca70f2e443bfcdc57", + "sha256:caa2734ada16a44ae57b229d45091f06e30a9a52ace76d7574546ab23008c635", + "sha256:cb34c2d66355fb70ae47b5595aafd7218e59bb9c00ad8cc3abd1406ca5874f07", + "sha256:cb3652bbe6720786b9137862205986f3ae54a09dec8499a995ed58292bdf77c2", + "sha256:cf668f26604e9f7aee9f8eaae4ca07a948168af90b96be97a4b7fa902a6d2ac1", + "sha256:d326ff80ed531bf2507cba93011c30fff2dd51454c85f55df0f59f2030b1687b", + "sha256:d6c2441538e4fadd4291c8420853431a229fcbefc1bf521810fbc2629d8ae8c2", + "sha256:d6ecfd1970b3380a569d7b3ecc5dd70dba295897418ed9e31ec3c16a5ab099a5", + "sha256:e5602a9b5074dcacc113bba4d2f011d2748f50e3201c8139ac5b68cf2a76bd8b", + "sha256:ef806f684f17dbd6263d72a54ad4073af42b42effa3eb42b877e750c24c76f86", + "sha256:f3356afbb301ec34a500b8ba8b47cba0b44ed4641c306e1dd981a08b416170b5", + "sha256:f6f7ee2289176cb1d2c59a24f50900f8b9580259fa9f1a739432242e7d254f93", + "sha256:f7e8f1ee28e0a05831c92dc1c0c1c94af5289963b7cf09eca5b5e3ce4f8c91b0", + "sha256:f8169ec628880bdbca67082a9196e2106060a4a5cbd486ac51881a4df805a36f", + "sha256:fbc88d3ba402b5d041d204ec2449c4078898f89c4a6e6f0ed1c1a510ef1e221d", + "sha256:fbd3fe37353c62fd0eb19fb76f78aa693716262bcd5f9c14bb9e5aca4b3f0dc4" + ], + "markers": "python_full_version >= '3.6.0'", + "version": "==2022.3.2" + }, "requests": { + "extras": [], "hashes": [ "sha256:bc7861137fbce630f17b03d3ad02ad0bf978c844f3536d0edda6499dafce2b6f", 
"sha256:d568723a7ebd25875d8d1eaf5dfa068cd2fc8194b2e483d7b1f7c81918dbec6b" @@ -799,7 +903,7 @@ "sha256:5c6bd9dc7a543b7fe4304a631f8a8a3b674e2bbfc49c2ae96200cdbe55df6b17", "sha256:95c5d300c4e879ee69708c428ba566c59478fd653cc3a22243eeb8ed846950bb" ], - "markers": "python_version >= '3.6' and python_version < '4'", + "markers": "python_version < '4' and python_full_version >= '3.6.0'", "version": "==4.8" }, "s3transfer": { @@ -853,7 +957,7 @@ "sha256:3b2503d3c7084a42b1ebd08116e5f81aadfaea95863628c80a3b774a11b7c759", "sha256:fc53893b3da2c33de295667a0e19f078c14bf86544af307354de5fcf12a3f30d" ], - "markers": "python_version >= '3.6'", + "markers": "python_full_version >= '3.6.0'", "version": "==2.3.2.post1" }, "telethon": { @@ -902,12 +1006,28 @@ "markers": "python_version >= '3.5'", "version": "==0.9.2" }, + "tzdata": { + "hashes": [ + "sha256:238e70234214138ed7b4e8a0fab0e5e13872edab3be586ab8198c407620e2ab9", + "sha256:8b536a8ec63dc0751342b3984193a3118f8fca2afe25752bb9b7fffd398552d3" + ], + "markers": "python_full_version >= '3.6.0'", + "version": "==2022.1" + }, + "tzlocal": { + "hashes": [ + "sha256:89885494684c929d9191c57aa27502afc87a579be5cdd3225c77c463ea043745", + "sha256:ee5842fa3a795f023514ac2d801c4a81d1743bbe642e3940143326b3a00addd7" + ], + "markers": "python_full_version >= '3.6.0'", + "version": "==4.2" + }, "uritemplate": { "hashes": [ "sha256:4346edfc5c3b79f694bccd6d6099a322bbeb628dbf2cd86eea55a456ce5124f0", "sha256:830c08b8d99bdd312ea4ead05994a38e8936266f84b9a7878232db50b044e02e" ], - "markers": "python_version >= '3.6'", + "markers": "python_full_version >= '3.6.0'", "version": "==4.1.1" }, "urllib3": { @@ -922,6 +1042,14 @@ "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' and python_version < '4'", "version": "==1.26.9" }, + "vk-api": { + "hashes": [ + "sha256:11c731e214ebc7fa911db81efb021f97587493a5402b992f24748fe1cd9d7afc", + "sha256:d0ae766fa93a40d47c5da045d94201721bf766dbde122a1d2253516b35c5edf3" + ], + "index": 
"pypi", + "version": "==11.9.8" + }, "websockets": { "hashes": [ "sha256:07cdc0a5b2549bcfbadb585ad8471ebdc7bdf91e32e34ae3889001c1c106a6af", diff --git a/archivers/__init__.py b/archivers/__init__.py index 40fbb4b..33700d1 100644 --- a/archivers/__init__.py +++ b/archivers/__init__.py @@ -5,4 +5,5 @@ from .telethon_archiver import TelethonArchiver from .tiktok_archiver import TiktokArchiver from .wayback_archiver import WaybackArchiver from .youtubedl_archiver import YoutubeDLArchiver -from .twitter_archiver import TwitterArchiver \ No newline at end of file +from .twitter_archiver import TwitterArchiver +from .vk_archiver import VkArchiver \ No newline at end of file diff --git a/archivers/vk_archiver.py b/archivers/vk_archiver.py new file mode 100644 index 0000000..91eb0db --- /dev/null +++ b/archivers/vk_archiver.py @@ -0,0 +1,72 @@ +import re, json + +import vk_api, dateparser +from bs4 import BeautifulSoup +from loguru import logger + +from storages import Storage +from .base_archiver import Archiver, ArchiveResult +from configs import VkConfig + + +class VkArchiver(Archiver): + """" + VK videos are handled by YTDownloader, this archiver gets posts text and images. 
+ Currently only works for /wall posts + """ + name = "vk" + wall_pattern = re.compile(r"(wall.{0,1}\d+_\d+)") + onclick_pattern = re.compile(r"({.*})") + + def __init__(self, storage: Storage, driver, config: VkConfig): + super().__init__(storage, driver) + if config != None: + self.vk_session = vk_api.VkApi(config.username, config.password) + self.vk_session.auth(token_only=True) + + def download(self, url, check_if_exists=False): + # detect URLs that this archiver can handle + has_wall = self.wall_pattern.search(url) + if has_wall: + wall_url = f'https://vk.com/{has_wall[0]}' + logger.info(f"found valid wall id from {url=} : {wall_url=}") + return self.archive_wall(wall_url, check_if_exists) + return False + + def archive_wall(self, wall_url, check_if_exists): + res = self.vk_session.http.get(wall_url).text + soup = BeautifulSoup(res, "html.parser") + image_urls = [] + time = None + try: + rel_date = soup.find("a", class_="post_link").find("span", class_="rel_date") + t = rel_date.get_text() + if "time" in rel_date.attrs: + t = rel_date["time"] + elif "abs_time" in rel_date.attrs: + t = rel_date["abs_time"] + time = dateparser.parse(t, settings={"RETURN_AS_TIMEZONE_AWARE": True, "TO_TIMEZONE": "UTC"}) + except Exception as e: + logger.warning(f"could not fetch time from post: {e}") + + post = soup.find("div", class_="wall_text") + post_text = post.find(class_="wall_post_text").get_text() + for anchor in post.find_all("a", attrs={"aria-label": "photo"}): + if img_url := self.get_image_from_anchor(anchor): + image_urls.append(img_url) + + page_cdn, page_hash, thumbnail = self.generate_media_page(image_urls, wall_url, post_text, requester=self.vk_session.http) + screenshot = self.get_screenshot(wall_url) + return ArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=time) + + def get_image_from_anchor(self, anchor): + try: + # get anchor.onlick text, retrieve the JSON value there + # retrieve 
"temp"."z" which contains the image with more quality + temp_json = json.loads(self.onclick_pattern.search(anchor["onclick"])[0])["temp"] + for quality in ["z", "y", "x"]: # decreasing quality + if quality in temp_json: + return temp_json[quality] + except Exception as e: + logger.warning(f"failed to get image from vk wall anchor: {e}") + return False diff --git a/auto_archive.py b/auto_archive.py index 8c5643a..2b8a9e9 100644 --- a/auto_archive.py +++ b/auto_archive.py @@ -3,7 +3,7 @@ import os, datetime, shutil, traceback, random from loguru import logger from slugify import slugify -from archivers import TelethonArchiver, TelegramArchiver, TiktokArchiver, YoutubeDLArchiver, TwitterArchiver, WaybackArchiver, ArchiveResult, Archiver +from archivers import TelethonArchiver, TelegramArchiver, TiktokArchiver, YoutubeDLArchiver, TwitterArchiver, VkArchiver, WaybackArchiver, ArchiveResult, Archiver from utils import GWorksheet, mkdir_if_not_exists, expand_url from configs import Config from storages import Storage @@ -95,6 +95,7 @@ def process_sheet(c: Config): YoutubeDLArchiver(storage, c.webdriver, c.facebook_cookie), TelegramArchiver(storage, c.webdriver), TwitterArchiver(storage, c.webdriver), + VkArchiver(storage, c.webdriver, c.vk_config), WaybackArchiver(storage, c.webdriver, c.wayback_config) ] diff --git a/configs/__init__.py b/configs/__init__.py index 5a693ca..f70c9c6 100644 --- a/configs/__init__.py +++ b/configs/__init__.py @@ -1,4 +1,5 @@ from .config import Config from .selenium_config import SeleniumConfig from .telethon_config import TelethonConfig -from .wayback_config import WaybackConfig \ No newline at end of file +from .wayback_config import WaybackConfig +from .vk_config import VkConfig \ No newline at end of file diff --git a/configs/config.py b/configs/config.py index dfe786c..3cef93a 100644 --- a/configs/config.py +++ b/configs/config.py @@ -9,6 +9,7 @@ from utils import GWorksheet, getattr_or from .wayback_config import WaybackConfig from 
.telethon_config import TelethonConfig from .selenium_config import SeleniumConfig +from .vk_config import VkConfig from storages import Storage, S3Config, S3Storage, GDStorage, GDConfig, LocalStorage, LocalConfig @@ -120,6 +121,7 @@ class Config: secret=secrets["wayback"]["secret"], ) else: + self.wayback_config = None logger.debug(f"'wayback' key not present in the {self.config_file=}") # telethon config @@ -130,8 +132,19 @@ class Config: bot_token=secrets["telegram"].get("bot_token", None) ) else: + self.telegram_config = None logger.debug(f"'telegram' key not present in the {self.config_file=}") + # vk config + if "vk" in secrets: + self.vk_config = VkConfig( + username=secrets["vk"]["username"], + password=secrets["vk"]["password"] + ) + else: + self.vk_config = None + logger.debug(f"'vk' key not present in the {self.config_file=}") + del self.config["secrets"] # delete to prevent leaks def set_log_files(self): @@ -225,6 +238,7 @@ class Config: "local_config": hasattr(self, "local_config"), "wayback_config": self.wayback_config != None, "telegram_config": self.telegram_config != None, + "vk_config": self.vk_config != None, "gsheets_client": self.gsheets_client != None, "column_names": self.column_names, }, ensure_ascii=False, indent=4) diff --git a/configs/vk_config.py b/configs/vk_config.py new file mode 100644 index 0000000..db2e61c --- /dev/null +++ b/configs/vk_config.py @@ -0,0 +1,8 @@ + +from dataclasses import dataclass + + +@dataclass +class VkConfig: + username: str + password: str From 951b16ba9c19053836973709eea9bacab00e7fb3 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Wed, 15 Jun 2022 16:38:30 +0200 Subject: [PATCH 02/23] improving media page with images and videos --- archivers/base_archiver.py | 26 ++++++++++++++++++++++---- 1 file changed, 22 insertions(+), 4 deletions(-) diff --git a/archivers/base_archiver.py b/archivers/base_archiver.py index 18e4c1b..eb508c0 100644 --- 
a/archivers/base_archiver.py +++ b/archivers/base_archiver.py @@ -1,4 +1,4 @@ -import os, datetime, shutil, hashlib, time, requests, re +import os, datetime, shutil, hashlib, time, requests, re, mimetypes from dataclasses import dataclass from abc import ABC, abstractmethod from urllib.parse import urlparse @@ -58,7 +58,13 @@ class Archiver(ABC):

{url}

{self.name} object data:

{object}" page += f"" @@ -77,8 +83,20 @@ class Archiver(ABC): page_cdn = self.storage.get_cdn_url(page_key) return (page_cdn, page_hash, thumbnail) + def _guess_file_type(self, path:str): + """ + Receives a URL or filename and returns global mimetype like 'image' or 'video' and the specific mimetype as a tuple + see https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/MIME_types/Common_types + ex: ('audio', 'audio/mp3') + """ + mime = mimetypes.guess_type(path)[0] + if mime is not None: + return mime.split("/")[0], mime + return "", "" + + # eg images in a tweet save to cloud storage - def generate_media_page(self, urls, url, object): + def generate_media_page(self, urls, url, object, requester=requests): """ For a list of media urls, fetch them, upload them and call self.generate_media_page_html with them @@ -94,7 +112,7 @@ class Archiver(ABC): filename = os.path.join(Storage.TMP_FOLDER, key) - d = requests.get(media_url, headers=headers) + d = requester.get(media_url, headers=headers) with open(filename, 'wb') as f: f.write(d.content) From 771c5376c4b9706c315bd7efb69bde6f4f19f170 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Wed, 15 Jun 2022 16:47:20 +0200 Subject: [PATCH 03/23] simplify display --- archivers/base_archiver.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/archivers/base_archiver.py b/archivers/base_archiver.py index eb508c0..76d6267 100644 --- a/archivers/base_archiver.py +++ b/archivers/base_archiver.py @@ -58,12 +58,12 @@ class Archiver(ABC):

{url}

{self.name} object data:

{object}" @@ -83,19 +83,18 @@ class Archiver(ABC): page_cdn = self.storage.get_cdn_url(page_key) return (page_cdn, page_hash, thumbnail) - def _guess_file_type(self, path:str): + def _guess_file_type(self, path: str): """ - Receives a URL or filename and returns global mimetype like 'image' or 'video' and the specific mimetype as a tuple + Receives a URL or filename and returns global mimetype like 'image' or 'video' see https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/MIME_types/Common_types - ex: ('audio', 'audio/mp3') """ mime = mimetypes.guess_type(path)[0] if mime is not None: - return mime.split("/")[0], mime - return "", "" - + return mime.split("/")[0] + return "" # eg images in a tweet save to cloud storage + def generate_media_page(self, urls, url, object, requester=requests): """ For a list of media urls, fetch them, upload them From 2dbdf9b8d3e68bc53853c97143c8be370fb4d9a3 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Wed, 15 Jun 2022 17:04:50 +0200 Subject: [PATCH 04/23] check if exists --- archivers/base_archiver.py | 1 - archivers/vk_archiver.py | 17 +++++++++++++---- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/archivers/base_archiver.py b/archivers/base_archiver.py index 76d6267..2ba7c4d 100644 --- a/archivers/base_archiver.py +++ b/archivers/base_archiver.py @@ -225,7 +225,6 @@ class Archiver(ABC): key = key_folder + fname self.storage.upload(thumbnail_filename, key) - cdn_url = self.storage.get_cdn_url(key) cdn_urls.append(cdn_url) diff --git a/archivers/vk_archiver.py b/archivers/vk_archiver.py index 91eb0db..b9d3f6c 100644 --- a/archivers/vk_archiver.py +++ b/archivers/vk_archiver.py @@ -28,12 +28,21 @@ class VkArchiver(Archiver): # detect URLs that this archiver can handle has_wall = self.wall_pattern.search(url) if has_wall: - wall_url = f'https://vk.com/{has_wall[0]}' - logger.info(f"found valid wall id from {url=} : {wall_url=}") - return 
self.archive_wall(wall_url, check_if_exists) + wall_id = has_wall[0] + wall_url = f'https://vk.com/{wall_id}' + logger.info(f"found valid wall id from {url=} : {wall_id=}") + key = self.get_html_key(wall_url) + + # if check if exists will not download again + if check_if_exists and self.storage.exists(key): + screenshot = self.get_screenshot(wall_url) + cdn_url = self.storage.get_cdn_url(key) + return ArchiveResult(status="already archived", cdn_url=cdn_url, screenshot=screenshot) + + return self.archive_wall(wall_url) return False - def archive_wall(self, wall_url, check_if_exists): + def archive_wall(self, wall_url): res = self.vk_session.http.get(wall_url).text soup = BeautifulSoup(res, "html.parser") image_urls = [] From 5cc21fa4e05ffca94d0f323736d3d6493deed658 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Wed, 15 Jun 2022 17:04:56 +0200 Subject: [PATCH 05/23] bug fix --- archivers/telegram_archiver.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/archivers/telegram_archiver.py b/archivers/telegram_archiver.py index 22de30e..0b6e777 100644 --- a/archivers/telegram_archiver.py +++ b/archivers/telegram_archiver.py @@ -53,7 +53,6 @@ class TelegramArchiver(Archiver): key = self.get_key(video_id) filename = os.path.join(Storage.TMP_FOLDER, key) - cdn_url = self.storage.get_cdn_url(key) if check_if_exists and self.storage.exists(key): status = 'already archived' @@ -84,5 +83,6 @@ class TelegramArchiver(Archiver): filename, key, duration=duration) os.remove(filename) + cdn_url = self.storage.get_cdn_url(key) return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb, thumbnail_index=thumb_index, duration=duration, title=original_url, timestamp=s.find_all('time')[1].get('datetime'), hash=hash, screenshot=screenshot) From b1f70bb81890eeb7d8502e5273144cecdc92ed79 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Wed, 15 Jun 2022 17:14:08 +0200 
Subject: [PATCH 06/23] minor improvements --- archivers/wayback_archiver.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/archivers/wayback_archiver.py b/archivers/wayback_archiver.py index 81c1644..f46d1cb 100644 --- a/archivers/wayback_archiver.py +++ b/archivers/wayback_archiver.py @@ -18,10 +18,12 @@ class WaybackArchiver(Archiver): def __init__(self, storage: Storage, driver, config: WaybackConfig): super(WaybackArchiver, self).__init__(storage, driver) self.config = config - # TODO: this logic should live at the auto-archiver level self.seen_urls = {} def download(self, url, check_if_exists=False): + if self.config is None: + logger.error('Missing Wayback config') + return False if check_if_exists: if url in self.seen_urls: return self.seen_urls[url] @@ -57,7 +59,7 @@ class WaybackArchiver(Archiver): retries += 1 if status_r.status_code != 200: - return ArchiveResult(status="Internet archive failed", screenshot=screenshot) + return ArchiveResult(status=f"Internet archive failed: check https://web.archive.org/save/status/{job_id}", screenshot=screenshot) status_json = status_r.json() if status_json['status'] != 'success': From 86e1d3545ef85d616bbe51be5141337aa6277062 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Wed, 15 Jun 2022 17:17:46 +0200 Subject: [PATCH 07/23] fix for missing telethon config --- archivers/telethon_archiver.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/archivers/telethon_archiver.py b/archivers/telethon_archiver.py index 18996d8..bfc7f57 100644 --- a/archivers/telethon_archiver.py +++ b/archivers/telethon_archiver.py @@ -16,8 +16,9 @@ class TelethonArchiver(Archiver): def __init__(self, storage: Storage, driver, config: TelethonConfig): super().__init__(storage, driver) - self.client = TelegramClient("./anon", config.api_id, config.api_hash) - self.bot_token = config.bot_token + if config: + self.client = TelegramClient("./anon", 
config.api_id, config.api_hash) + self.bot_token = config.bot_token def _get_media_posts_in_group(self, chat, original_post, max_amp=10): """ @@ -38,6 +39,10 @@ class TelethonArchiver(Archiver): return media def download(self, url, check_if_exists=False): + if not hasattr(self, "client"): + logger.error('Missing Telethon config') + return False + # detect URLs that we definitely cannot handle matches = self.link_pattern.findall(url) if not len(matches): From 2f02336403dd186d6041682b237b82a75902838d Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Wed, 15 Jun 2022 17:18:47 +0200 Subject: [PATCH 08/23] example config --- example.config.yaml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/example.config.yaml b/example.config.yaml index 0c568c2..9026ad4 100644 --- a/example.config.yaml +++ b/example.config.yaml @@ -39,6 +39,11 @@ secrets: # optional, but allows access to more content such as large videos, talk to @botfather bot_token: your bot-token + # vkontakte (vk.com) credentials + vk: + username: "phone number or email" + password: "password" + google_sheets: # local filename: defaults to service_account.json, see https://gspread.readthedocs.io/en/latest/oauth2.html#for-bots-using-service-account service_account: "service_account.json" From c08b5268f730ead8df42ee32ddc9f13f99d058dc Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Wed, 15 Jun 2022 21:25:15 +0200 Subject: [PATCH 09/23] using API instead of scraping --- archivers/base_archiver.py | 4 +- archivers/vk_archiver.py | 85 ++++++++++++++++++-------------------- 2 files changed, 42 insertions(+), 47 deletions(-) diff --git a/archivers/base_archiver.py b/archivers/base_archiver.py index 2ba7c4d..0f490fe 100644 --- a/archivers/base_archiver.py +++ b/archivers/base_archiver.py @@ -95,7 +95,7 @@ class Archiver(ABC): # eg images in a tweet save to cloud storage - def generate_media_page(self, urls, url, object, 
requester=requests): + def generate_media_page(self, urls, url, object): """ For a list of media urls, fetch them, upload them and call self.generate_media_page_html with them @@ -111,7 +111,7 @@ class Archiver(ABC): filename = os.path.join(Storage.TMP_FOLDER, key) - d = requester.get(media_url, headers=headers) + d = requests.get(media_url, headers=headers) with open(filename, 'wb') as f: f.write(d.content) diff --git a/archivers/vk_archiver.py b/archivers/vk_archiver.py index b9d3f6c..e37c7a5 100644 --- a/archivers/vk_archiver.py +++ b/archivers/vk_archiver.py @@ -1,4 +1,4 @@ -import re, json +import re, json, requests import vk_api, dateparser from bs4 import BeautifulSoup @@ -16,6 +16,7 @@ class VkArchiver(Archiver): """ name = "vk" wall_pattern = re.compile(r"(wall.{0,1}\d+_\d+)") + photo_pattern = re.compile(r"(photo.{0,1}\d+_\d+)") onclick_pattern = re.compile(r"({.*})") def __init__(self, storage: Storage, driver, config: VkConfig): @@ -27,55 +28,49 @@ class VkArchiver(Archiver): def download(self, url, check_if_exists=False): # detect URLs that this archiver can handle has_wall = self.wall_pattern.search(url) + has_photo = self.photo_pattern.search(url) + _id, method = None, None if has_wall: - wall_id = has_wall[0] - wall_url = f'https://vk.com/{wall_id}' - logger.info(f"found valid wall id from {url=} : {wall_id=}") - key = self.get_html_key(wall_url) + _id = has_wall[0] + method = self.archive_wall + elif has_photo: + _id = has_photo[0] + method = self.archive_photo + else: return False - # if check if exists will not download again - if check_if_exists and self.storage.exists(key): - screenshot = self.get_screenshot(wall_url) - cdn_url = self.storage.get_cdn_url(key) - return ArchiveResult(status="already archived", cdn_url=cdn_url, screenshot=screenshot) + logger.info(f"found valid {_id=} from {url=}") + proper_url = f'https://vk.com/{_id}' - return self.archive_wall(wall_url) - return False + # if check if exists will not download again + key = 
self.get_html_key(proper_url) + if check_if_exists and self.storage.exists(key): + screenshot = self.get_screenshot(proper_url) + cdn_url = self.storage.get_cdn_url(key) + return ArchiveResult(status="already archived", cdn_url=cdn_url, screenshot=screenshot) - def archive_wall(self, wall_url): - res = self.vk_session.http.get(wall_url).text - soup = BeautifulSoup(res, "html.parser") - image_urls = [] - time = None - try: - rel_date = soup.find("a", class_="post_link").find("span", class_="rel_date") - t = rel_date.get_text() - if "time" in rel_date.attrs: - t = rel_date["time"] - elif "abs_time" in rel_date.attrs: - t = rel_date["abs_time"] - time = dateparser.parse(t, settings={"RETURN_AS_TIMEZONE_AWARE": True, "TO_TIMEZONE": "UTC"}) - except Exception as e: - logger.warning(f"could not fetch time from post: {e}") + return method(proper_url, _id) - post = soup.find("div", class_="wall_text") - post_text = post.find(class_="wall_post_text").get_text() - for anchor in post.find_all("a", attrs={"aria-label": "photo"}): - if img_url := self.get_image_from_anchor(anchor): - image_urls.append(img_url) + def archive_photo(self, photo_url, photo_id): + headers = {"access_token": self.vk_session.token["access_token"], "photos": photo_id.replace("photo", ""), "extended": "1", "v": self.vk_session.api_version} + req = requests.get("https://api.vk.com/method/photos.getById", headers) + res = req.json()["response"][0] + img_url = res["orig_photo"]["url"] + time = dateparser.parse(str(res["date"]), settings={"RETURN_AS_TIMEZONE_AWARE": True, "TO_TIMEZONE": "UTC"}) - page_cdn, page_hash, thumbnail = self.generate_media_page(image_urls, wall_url, post_text, requester=self.vk_session.http) - screenshot = self.get_screenshot(wall_url) + page_cdn, page_hash, thumbnail = self.generate_media_page([img_url], photo_url, res) + screenshot = self.get_screenshot(photo_url) return ArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, 
timestamp=time) - def get_image_from_anchor(self, anchor): - try: - # get anchor.onlick text, retrieve the JSON value there - # retrieve "temp"."z" which contains the image with more quality - temp_json = json.loads(self.onclick_pattern.search(anchor["onclick"])[0])["temp"] - for quality in ["z", "y", "x"]: # decreasing quality - if quality in temp_json: - return temp_json[quality] - except Exception as e: - logger.warning(f"failed to get image from vk wall anchor: {e}") - return False + def archive_wall(self, wall_url, wall_id): + headers = {"access_token": self.vk_session.token["access_token"], "posts": wall_id.replace("wall", ""), "extended": "1", "copy_history_depth": "2", "v": self.vk_session.api_version} + req = requests.get("https://api.vk.com/method/wall.getById", headers) + res = req.json()["response"] + wall = res["items"][0] + img_urls = [p[p["type"]]["sizes"][-1]["url"] for p in wall["attachments"]] if "attachments" in wall else [] + title = wall["text"][0:200] # more on the page + time = dateparser.parse(str(wall["date"]), settings={"RETURN_AS_TIMEZONE_AWARE": True, "TO_TIMEZONE": "UTC"}) + + page_cdn, page_hash, thumbnail = self.generate_media_page(img_urls, wall_url, res) + screenshot = self.get_screenshot(wall_url) + return ArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=time, title=title) + From ed4b193ae7209f723f774f0d78b89c3e87d289eb Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Wed, 15 Jun 2022 22:30:08 +0200 Subject: [PATCH 10/23] walrus --- archivers/vk_archiver.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/archivers/vk_archiver.py b/archivers/vk_archiver.py index e37c7a5..d62a2a5 100644 --- a/archivers/vk_archiver.py +++ b/archivers/vk_archiver.py @@ -27,13 +27,11 @@ class VkArchiver(Archiver): def download(self, url, check_if_exists=False): # detect URLs that this archiver can handle - has_wall = 
self.wall_pattern.search(url) - has_photo = self.photo_pattern.search(url) _id, method = None, None - if has_wall: + if has_wall := self.wall_pattern.search(url): _id = has_wall[0] method = self.archive_wall - elif has_photo: + elif has_photo := self.photo_pattern.search(url): _id = has_photo[0] method = self.archive_photo else: return False From 3b6678818e053e47ac5e450faa5e8361aa893af4 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Wed, 15 Jun 2022 22:47:55 +0200 Subject: [PATCH 11/23] title for vk photo --- archivers/vk_archiver.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/archivers/vk_archiver.py b/archivers/vk_archiver.py index d62a2a5..e9c999f 100644 --- a/archivers/vk_archiver.py +++ b/archivers/vk_archiver.py @@ -52,12 +52,13 @@ class VkArchiver(Archiver): headers = {"access_token": self.vk_session.token["access_token"], "photos": photo_id.replace("photo", ""), "extended": "1", "v": self.vk_session.api_version} req = requests.get("https://api.vk.com/method/photos.getById", headers) res = req.json()["response"][0] + title = res["text"][:200] # more on the page img_url = res["orig_photo"]["url"] time = dateparser.parse(str(res["date"]), settings={"RETURN_AS_TIMEZONE_AWARE": True, "TO_TIMEZONE": "UTC"}) page_cdn, page_hash, thumbnail = self.generate_media_page([img_url], photo_url, res) screenshot = self.get_screenshot(photo_url) - return ArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=time) + return ArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=time, title=title) def archive_wall(self, wall_url, wall_id): headers = {"access_token": self.vk_session.token["access_token"], "posts": wall_id.replace("wall", ""), "extended": "1", "copy_history_depth": "2", "v": self.vk_session.api_version} @@ -65,7 +66,7 @@ class VkArchiver(Archiver): res = 
req.json()["response"] wall = res["items"][0] img_urls = [p[p["type"]]["sizes"][-1]["url"] for p in wall["attachments"]] if "attachments" in wall else [] - title = wall["text"][0:200] # more on the page + title = wall["text"][:200] # more on the page time = dateparser.parse(str(wall["date"]), settings={"RETURN_AS_TIMEZONE_AWARE": True, "TO_TIMEZONE": "UTC"}) page_cdn, page_hash, thumbnail = self.generate_media_page(img_urls, wall_url, res) From 659097c07213d107b63e24c4e77b3776c4aaf71f Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Wed, 15 Jun 2022 22:54:18 +0200 Subject: [PATCH 12/23] better error log --- archivers/vk_archiver.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/archivers/vk_archiver.py b/archivers/vk_archiver.py index e9c999f..ee43e8e 100644 --- a/archivers/vk_archiver.py +++ b/archivers/vk_archiver.py @@ -46,13 +46,17 @@ class VkArchiver(Archiver): cdn_url = self.storage.get_cdn_url(key) return ArchiveResult(status="already archived", cdn_url=cdn_url, screenshot=screenshot) - return method(proper_url, _id) + try: + return method(proper_url, _id) + except Exception as e: + logger.error(f"something went wrong with vk archive, possibly 404 causing index out of range, or missing key: {e}") + return False def archive_photo(self, photo_url, photo_id): headers = {"access_token": self.vk_session.token["access_token"], "photos": photo_id.replace("photo", ""), "extended": "1", "v": self.vk_session.api_version} req = requests.get("https://api.vk.com/method/photos.getById", headers) res = req.json()["response"][0] - title = res["text"][:200] # more on the page + title = res["text"][:200] # more on the page img_url = res["orig_photo"]["url"] time = dateparser.parse(str(res["date"]), settings={"RETURN_AS_TIMEZONE_AWARE": True, "TO_TIMEZONE": "UTC"}) @@ -66,10 +70,9 @@ class VkArchiver(Archiver): res = req.json()["response"] wall = res["items"][0] img_urls = 
[p[p["type"]]["sizes"][-1]["url"] for p in wall["attachments"]] if "attachments" in wall else [] - title = wall["text"][:200] # more on the page + title = wall["text"][:200] # more on the page time = dateparser.parse(str(wall["date"]), settings={"RETURN_AS_TIMEZONE_AWARE": True, "TO_TIMEZONE": "UTC"}) page_cdn, page_hash, thumbnail = self.generate_media_page(img_urls, wall_url, res) screenshot = self.get_screenshot(wall_url) return ArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=time, title=title) - From 08f48ae351d94c6d46f2a7eb6b3d43544bf2595a Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Wed, 15 Jun 2022 23:17:32 +0200 Subject: [PATCH 13/23] handling selenium better --- configs/config.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/configs/config.py b/configs/config.py index 3cef93a..78a5090 100644 --- a/configs/config.py +++ b/configs/config.py @@ -4,6 +4,7 @@ import gspread from loguru import logger from selenium import webdriver from dataclasses import asdict +from selenium.common.exceptions import TimeoutException from utils import GWorksheet, getattr_or from .wayback_config import WaybackConfig @@ -210,16 +211,23 @@ class Config: def destroy_webdriver(self): if self.webdriver is not None and type(self.webdriver) != str: self.webdriver.quit() + del self.webdriver def recreate_webdriver(self): - self.destroy_webdriver() options = webdriver.FirefoxOptions() options.headless = True options.set_preference('network.protocol-handler.external.tg', False) - self.webdriver = webdriver.Firefox(options=options) - self.webdriver.set_window_size(self.selenium_config.window_width, + try: + new_webdriver = webdriver.Firefox(options=options) + # only destroy if creation is successful + self.destroy_webdriver() + self.webdriver = new_webdriver + self.webdriver.set_window_size(self.selenium_config.window_width, 
self.selenium_config.window_height) - self.webdriver.set_page_load_timeout(self.selenium_config.timeout_seconds) + self.webdriver.set_page_load_timeout(self.selenium_config.timeout_seconds) + except TimeoutException as e: + logger.error(f"failed to get new webdriver, possibly due to insufficient system resources or timeout settings: {e}") + def __str__(self) -> str: return json.dumps({ From c6bcb5900562074e75c0d2bd7f0aeaba804b4823 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Wed, 15 Jun 2022 23:36:10 +0200 Subject: [PATCH 14/23] improvement for albums --- archivers/vk_archiver.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/archivers/vk_archiver.py b/archivers/vk_archiver.py index ee43e8e..e48e9ef 100644 --- a/archivers/vk_archiver.py +++ b/archivers/vk_archiver.py @@ -69,7 +69,18 @@ class VkArchiver(Archiver): req = requests.get("https://api.vk.com/method/wall.getById", headers) res = req.json()["response"] wall = res["items"][0] - img_urls = [p[p["type"]]["sizes"][-1]["url"] for p in wall["attachments"]] if "attachments" in wall else [] + img_urls = [] + if "attachments" in wall: + for a in wall["attachments"]: + attachment = a[a["type"]] + if "thumb" in attachment: + attachment = attachment["thumb"] + if "sizes" in attachment: + try: img_urls.append(attachment["sizes"][-1]["url"]) + except Exception as e: + logger.warning(f"could not get image from attachment: {e}") + + title = wall["text"][:200] # more on the page time = dateparser.parse(str(wall["date"]), settings={"RETURN_AS_TIMEZONE_AWARE": True, "TO_TIMEZONE": "UTC"}) From 2ac08a34f633800c073f82bd3f5eedc491456d8d Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Thu, 16 Jun 2022 13:45:02 +0200 Subject: [PATCH 15/23] ydl timestamp bug fix --- archivers/youtubedl_archiver.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/archivers/youtubedl_archiver.py 
b/archivers/youtubedl_archiver.py index be3477d..7990131 100644 --- a/archivers/youtubedl_archiver.py +++ b/archivers/youtubedl_archiver.py @@ -106,11 +106,11 @@ class YoutubeDLArchiver(Archiver): os.remove(filename) - timestamp = datetime.datetime.utcfromtimestamp(info['timestamp']).replace(tzinfo=datetime.timezone.utc).isoformat() \ - if 'timestamp' in info else \ - datetime.datetime.strptime(info['upload_date'], '%Y%m%d').replace(tzinfo=datetime.timezone.utc) \ - if 'upload_date' in info and info['upload_date'] is not None else \ - None + timestamp = None + if 'timestamp' in info and info['timestamp'] is not None: + timestamp = datetime.datetime.utcfromtimestamp(info['timestamp']).replace(tzinfo=datetime.timezone.utc).isoformat() + elif 'upload_date' in info and info['upload_date'] is not None: + timestamp = datetime.datetime.strptime(info['upload_date'], '%Y%m%d').replace(tzinfo=datetime.timezone.utc) return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb, thumbnail_index=thumb_index, duration=duration, title=info['title'] if 'title' in info else None, timestamp=timestamp, hash=hash, screenshot=screenshot) From 277d81d687ddfbf14ba56defa5a63555df130e40 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Thu, 16 Jun 2022 14:16:18 +0200 Subject: [PATCH 16/23] telethon minor fix --- archivers/telethon_archiver.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/archivers/telethon_archiver.py b/archivers/telethon_archiver.py index bfc7f57..d36c762 100644 --- a/archivers/telethon_archiver.py +++ b/archivers/telethon_archiver.py @@ -27,7 +27,7 @@ class TelethonArchiver(Archiver): of `max_amp` both ways Returns a list of [post] where each post has media and is in the same grouped_id """ - if original_post.grouped_id is None: + if not hasattr(original_post, "grouped_id") or original_post.grouped_id is None: return [original_post] if original_post.media is not None else [] search_ids = [i for i 
in range(original_post.id - max_amp, original_post.id + max_amp + 1)] From b37f7adc8fb082dfda1e05422f595a08e84dea8b Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Thu, 16 Jun 2022 14:29:51 +0200 Subject: [PATCH 17/23] another telethon fix --- archivers/telethon_archiver.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/archivers/telethon_archiver.py b/archivers/telethon_archiver.py index d36c762..bbea956 100644 --- a/archivers/telethon_archiver.py +++ b/archivers/telethon_archiver.py @@ -28,7 +28,7 @@ class TelethonArchiver(Archiver): Returns a list of [post] where each post has media and is in the same grouped_id """ if not hasattr(original_post, "grouped_id") or original_post.grouped_id is None: - return [original_post] if original_post.media is not None else [] + return [original_post] if hasattr(original_post, "media") and original_post.media is not None else [] search_ids = [i for i in range(original_post.id - max_amp, original_post.id + max_amp + 1)] posts = self.client.get_messages(chat, ids=search_ids) From ec1993c5dc812b3fbfebf75793b07bc192cb4018 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Thu, 16 Jun 2022 14:33:50 +0200 Subject: [PATCH 18/23] telethon fix --- archivers/telethon_archiver.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/archivers/telethon_archiver.py b/archivers/telethon_archiver.py index bbea956..ad1fadb 100644 --- a/archivers/telethon_archiver.py +++ b/archivers/telethon_archiver.py @@ -8,6 +8,7 @@ from telethon.errors import ChannelInvalidError from storages import Storage from .base_archiver import Archiver, ArchiveResult from configs import TelethonConfig +from utils import getattr_or class TelethonArchiver(Archiver): @@ -27,8 +28,8 @@ class TelethonArchiver(Archiver): of `max_amp` both ways Returns a list of [post] where each post has media and is in the same grouped_id """ - if not 
hasattr(original_post, "grouped_id") or original_post.grouped_id is None: - return [original_post] if hasattr(original_post, "media") and original_post.media is not None else [] + if getattr_or(original_post, "grouped_id") is not None: + return [original_post] if getattr_or(original_post, "media") is not None else [] search_ids = [i for i in range(original_post.id - max_amp, original_post.id + max_amp + 1)] posts = self.client.get_messages(chat, ids=search_ids) @@ -110,4 +111,4 @@ class TelethonArchiver(Archiver): return ArchiveResult(status=status, cdn_url=cdn_url, title=post.message, thumbnail=key_thumb, thumbnail_index=thumb_index, timestamp=post.date, hash=hash, screenshot=screenshot) page_cdn, page_hash, _ = self.generate_media_page_html(url, [], html.escape(str(post))) - return ArchiveResult(status=status, cdn_url=page_cdn, title=post.message, timestamp=post.date, hash=page_hash, screenshot=screenshot) + return ArchiveResult(status=status, cdn_url=page_cdn, title=getattr_or(post, "message", ""), timestamp=getattr_or(post, "date"), hash=page_hash, screenshot=screenshot) From 81ce27bdb3030755ca1ff12ba8d4169715b68fcd Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Thu, 16 Jun 2022 14:34:33 +0200 Subject: [PATCH 19/23] fix --- archivers/telethon_archiver.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/archivers/telethon_archiver.py b/archivers/telethon_archiver.py index ad1fadb..6407f09 100644 --- a/archivers/telethon_archiver.py +++ b/archivers/telethon_archiver.py @@ -28,7 +28,7 @@ class TelethonArchiver(Archiver): of `max_amp` both ways Returns a list of [post] where each post has media and is in the same grouped_id """ - if getattr_or(original_post, "grouped_id") is not None: + if getattr_or(original_post, "grouped_id") is None: return [original_post] if getattr_or(original_post, "media") is not None else [] search_ids = [i for i in range(original_post.id - max_amp, original_post.id + max_amp + 
1)] From 81eb00a76771526390785e00cf148601dfcf2845 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Thu, 16 Jun 2022 16:19:57 +0200 Subject: [PATCH 20/23] handle deleted telegram --- archivers/telethon_archiver.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/archivers/telethon_archiver.py b/archivers/telethon_archiver.py index 6407f09..efef1cc 100644 --- a/archivers/telethon_archiver.py +++ b/archivers/telethon_archiver.py @@ -29,7 +29,7 @@ class TelethonArchiver(Archiver): Returns a list of [post] where each post has media and is in the same grouped_id """ if getattr_or(original_post, "grouped_id") is None: - return [original_post] if getattr_or(original_post, "media") is not None else [] + return [original_post] if getattr_or(original_post, "media") else [] search_ids = [i for i in range(original_post.id - max_amp, original_post.id + max_amp + 1)] posts = self.client.get_messages(chat, ids=search_ids) @@ -67,6 +67,8 @@ class TelethonArchiver(Archiver): logger.error(f"Could not fetch telegram {url}. 
This error can be fixed if you setup a bot_token in addition to api_id and api_hash: {e}") return False + if post is None: return False + media_posts = self._get_media_posts_in_group(chat, post) logger.debug(f'got {len(media_posts)=} for {url=}') @@ -111,4 +113,4 @@ class TelethonArchiver(Archiver): return ArchiveResult(status=status, cdn_url=cdn_url, title=post.message, thumbnail=key_thumb, thumbnail_index=thumb_index, timestamp=post.date, hash=hash, screenshot=screenshot) page_cdn, page_hash, _ = self.generate_media_page_html(url, [], html.escape(str(post))) - return ArchiveResult(status=status, cdn_url=page_cdn, title=getattr_or(post, "message", ""), timestamp=getattr_or(post, "date"), hash=page_hash, screenshot=screenshot) + return ArchiveResult(status=status, cdn_url=page_cdn, title=post.message, timestamp=getattr_or(post, "date"), hash=page_hash, screenshot=screenshot) From afc7e133cf4fb12e4b7ce1e06b3adebf7c437cd0 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Thu, 16 Jun 2022 16:26:30 +0200 Subject: [PATCH 21/23] simplifying telethon --- archivers/telethon_archiver.py | 19 +++++-------------- 1 file changed, 5 insertions(+), 14 deletions(-) diff --git a/archivers/telethon_archiver.py b/archivers/telethon_archiver.py index efef1cc..2f4de02 100644 --- a/archivers/telethon_archiver.py +++ b/archivers/telethon_archiver.py @@ -74,7 +74,7 @@ class TelethonArchiver(Archiver): screenshot = self.get_screenshot(url) - if len(media_posts) > 1: + if len(media_posts) > 0: key = self.get_html_key(url) if check_if_exists and self.storage.exists(key): @@ -86,7 +86,7 @@ class TelethonArchiver(Archiver): group_id = post.grouped_id if post.grouped_id is not None else post.id uploaded_media = [] message = post.message - for mp in media_posts: + for i, mp in enumerate(media_posts): if len(mp.message) > len(message): message = mp.message filename_dest = os.path.join(Storage.TMP_FOLDER, f'{chat}_{group_id}', str(mp.id)) filename = 
self.client.download_media(mp.media, filename_dest) @@ -95,22 +95,13 @@ class TelethonArchiver(Archiver): hash = self.get_hash(filename) cdn_url = self.storage.get_cdn_url(key) uploaded_media.append({'cdn_url': cdn_url, 'key': key, 'hash': hash}) + if i == 0: + key_thumb, thumb_index = self.get_thumbnails(filename, key) os.remove(filename) page_cdn, page_hash, _ = self.generate_media_page_html(url, uploaded_media, html.escape(str(post))) - return ArchiveResult(status=status, cdn_url=page_cdn, title=message, timestamp=post.date, hash=page_hash, screenshot=screenshot) - elif len(media_posts) == 1: - key = self.get_key(f'{chat}_{post_id}') - filename = self.client.download_media(post.media, os.path.join(Storage.TMP_FOLDER, key)) - key = filename.split(Storage.TMP_FOLDER)[1].replace(" ", "") - self.storage.upload(filename, key) - hash = self.get_hash(filename) - cdn_url = self.storage.get_cdn_url(key) - key_thumb, thumb_index = self.get_thumbnails(filename, key) - os.remove(filename) - - return ArchiveResult(status=status, cdn_url=cdn_url, title=post.message, thumbnail=key_thumb, thumbnail_index=thumb_index, timestamp=post.date, hash=hash, screenshot=screenshot) + return ArchiveResult(status=status, cdn_url=page_cdn, title=message, timestamp=post.date, hash=page_hash, screenshot=screenshot, thumbnail=key_thumb, thumbnail_index=thumb_index) page_cdn, page_hash, _ = self.generate_media_page_html(url, [], html.escape(str(post))) return ArchiveResult(status=status, cdn_url=page_cdn, title=post.message, timestamp=getattr_or(post, "date"), hash=page_hash, screenshot=screenshot) From cdd66fb7dad8ca64aaad0dbff9cc9c767070f837 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Thu, 16 Jun 2022 16:30:08 +0200 Subject: [PATCH 22/23] returning empty string thumbs --- archivers/base_archiver.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/archivers/base_archiver.py b/archivers/base_archiver.py index 0f490fe..a8b0413 
100644 --- a/archivers/base_archiver.py +++ b/archivers/base_archiver.py @@ -229,7 +229,7 @@ class Archiver(ABC): cdn_urls.append(cdn_url) if len(cdn_urls) == 0: - return ('None', 'None') + return ('', '') key_thumb = cdn_urls[int(len(cdn_urls) * 0.1)] From 14add4392318c84f2ec626b9297eb0db0518fedb Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Thu, 16 Jun 2022 17:17:25 +0200 Subject: [PATCH 23/23] fixing auto_auto_archive --- README.md | 15 ++++++++------- auto_auto_archive.py | 31 ++++++++++++++++--------------- 2 files changed, 24 insertions(+), 22 deletions(-) diff --git a/README.md b/README.md index 39204a2..e60774c 100644 --- a/README.md +++ b/README.md @@ -140,7 +140,7 @@ With this configuration, the archiver should archive and store all media added t # auto_auto_archiver -To make it easier to set up new auto-archiver sheets, the auto-auto-archiver will look at a particular sheet and run the auto-archiver on every sheet name in column A, starting from row 11. (It starts here to support instructional text in the first rows of the sheet, as shown below.) This script takes one command line argument, with `--sheet`, the name of the sheet. It must be shared with the same service account. +To make it easier to set up new auto-archiver sheets, the auto-auto-archiver will look at a particular sheet and run the auto-archiver on every sheet name in column A, starting from row 11. (It starts here to support instructional text in the first rows of the sheet, as shown below.) You can simply use your default config as for `auto_archiver.py` but use `--sheet` to specify the name of the sheet that lists the names of sheets to archive. It must be shared with the same service account. ![A screenshot of a Google Spreadsheet configured to show instructional text and a list of sheet names to check with auto-archiver.](docs/auto-auto.png) @@ -152,15 +152,16 @@ Code is split into functional concepts: 1. 
[GWorksheet](utils/gworksheet.py) - facilitates some of the reading/writing tasks for a Google Worksheet ### Current Archivers -Archivers are tested in a meaningful order with Wayback Machine being the default, that can easily be changed in the code. +Archivers are tested in a meaningful order with Wayback Machine being the failsafe, that can easily be changed in the code. ```mermaid graph TD - A(Archiver) -->|parent of| B(YoutubeDLArchiver) - A -->|parent of| C(TikTokArchiver) - A -->|parent of| D(TwitterArchiver) + A(Archiver) -->|parent of| B(TelethonArchiver) + A -->|parent of| C(TiktokArchiver) + A -->|parent of| D(YoutubeDLArchiver) A -->|parent of| E(TelegramArchiver) - A -->|parent of| F(TelethonArchiver) - A -->|parent of| G(WaybackArchiver) + A -->|parent of| F(TwitterArchiver) + A -->|parent of| G(VkArchiver) + A -->|parent of| H(WaybackArchiver) ``` ### Current Storages ```mermaid diff --git a/auto_auto_archive.py b/auto_auto_archive.py index a518204..14bb751 100644 --- a/auto_auto_archive.py +++ b/auto_auto_archive.py @@ -1,29 +1,30 @@ -import gspread -import argparse +import shutil import auto_archive from loguru import logger +from configs import Config +from storages import Storage +from utils import mkdir_if_not_exists + def main(): - parser = argparse.ArgumentParser( - description="Automatically use youtube-dl to download media from a Google Sheet") - parser.add_argument("--sheet", action="store", dest="sheet") + c = Config() + c.parse() + logger.info(f'Opening document {c.sheet} to look for sheet names to archive') - args = parser.parse_args() - - logger.info("Opening document " + args.sheet) - - gc = gspread.service_account(filename='service_account.json') - sh = gc.open(args.sheet) + gc = c.gsheets_client + sh = gc.open(c.sheet) wks = sh.get_worksheet(0) values = wks.get_all_values() + mkdir_if_not_exists(Storage.TMP_FOLDER) for i in range(11, len(values)): - sheet_name = values[i][0] + c.sheet = values[i][0] + logger.info(f"Processing 
{c.sheet}") + auto_archive.process_sheet(c) + c.destroy_webdriver() + shutil.rmtree(Storage.TMP_FOLDER) - logger.info("Processing " + sheet_name) - - auto_archive.process_sheet(sheet_name) if __name__ == "__main__": main()