From 93be1af93f4efb22d2335c9c346349b7fae2f503 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Tue, 18 Oct 2022 15:45:10 +0100 Subject: [PATCH 001/190] adds instagram post/profile --- .gitignore | 3 +- Pipfile | 4 + Pipfile.lock | 78 +++++++++++++------ archivers/__init__.py | 3 +- archivers/instagram_archiver.py | 128 ++++++++++++++++++++++++++++++++ archivers/telethon_archiver.py | 4 +- auto_archive.py | 5 +- configs/__init__.py | 3 +- configs/config.py | 11 +++ configs/instagram_config.py | 8 ++ example.config.yaml | 5 ++ 11 files changed, 220 insertions(+), 32 deletions(-) create mode 100644 archivers/instagram_archiver.py create mode 100644 configs/instagram_config.py diff --git a/.gitignore b/.gitignore index 4d19b9e..04b03ee 100644 --- a/.gitignore +++ b/.gitignore @@ -21,4 +21,5 @@ gd-token.json credentials.json secrets/* browsertrix/* -browsertrix-tmp/* \ No newline at end of file +browsertrix-tmp/* +instaloader/* \ No newline at end of file diff --git a/Pipfile b/Pipfile index 88fad6a..aa04ea4 100644 --- a/Pipfile +++ b/Pipfile @@ -25,6 +25,10 @@ pyyaml = "*" dateparser = "*" vk-url-scraper = "*" python-twitter-v2 = "*" +instaloader = "*" [requires] python_version = "3.9" + +[dev-packages] +autopep8 = "*" diff --git a/Pipfile.lock b/Pipfile.lock index 271a661..e2d1b1b 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -1,7 +1,7 @@ { "_meta": { "hash": { - "sha256": "1ed953d08e31d891de0f887e520f12025d109a20718b27dd8f9b361f73c95651" + "sha256": "bd987e7237c7e32d2dffb295db633f5a022ce1a718435d11d8ac303c9e37a4d3" }, "pipfile-spec": 6, "requires": { @@ -29,7 +29,7 @@ "sha256:01c7bf666359b4967d2cda0000cc2e4af16a0ae098cbffcb8472fb9e8ad6585b", "sha256:6ebb3d106c12920aaae42ccb6f787ef5eefdcdd166ea3d628fa8476abe712144" ], - "markers": "python_full_version >= '3.5.0'", + "markers": "python_version >= '3.5'", "version": "==1.10" }, "attrs": { @@ -37,7 +37,7 @@ "sha256:29adc2665447e5191d0e7c568fde78b21f9672d344281d0c6e1ab085429b22b6", "sha256:86efa402f67bf2df34f51a335487cf46b1ec130d02b8d39fd248abfd30da551c" ], - "markers": "python_full_version >= '3.5.0'", + "markers": "python_version >= '3.5'", "version": "==22.1.0" }, "authlib": { @@ -159,7 +159,7 @@ "sha256:84c85a9078b11105f04f3036a9482ae10e4621616db313fe045dd24743a0820d", "sha256:fe86415d55e84719d75f8b69414f6438ac3547d2078ab91b67e779ef69378412" ], - "markers": "python_version >= '3.6'", + "markers": "python_full_version >= '3.6.0'", "version": "==2022.6.15" }, "cffi": { @@ -236,7 +236,7 @@ "sha256:2857e29ff0d34db842cd7ca3230549d1a697f96ee6d3fb071cfa6c7393832597", "sha256:6881edbebdb17b39b4eaaa821b438bf6eddffb4468cf344f09f89def34a8b1df" ], - "markers": "python_full_version >= '3.5.0'", + "markers": "python_version >= '3.5'", "version": "==2.0.12" }, "click": { @@ -290,7 +290,7 @@ "sha256:d4ef6cc305394ed669d4d9eebf10d3a101059bdcf2669c366ec1d14e4fb227bd", "sha256:d9e69ae01f99abe6ad646947bba8941e896cb3aa805be2597a0400e0764b5818" ], - "markers": "python_version >= '3.6'", + "markers": "python_full_version >= '3.6.0'", "version": "==38.0.1" }, "dataclasses-json": { @@ -298,7 +298,7 @@ "sha256:bc285b5f892094c3a53d558858a88553dd6a61a11ab1a8128a0e554385dcc5dd", "sha256:c2c11bc8214fbf709ffc369d11446ff6945254a7f09128154a7620613d8fda90" ], - "markers": "python_version >= '3.6'", + "markers": "python_full_version >= '3.6.0'", "version": "==0.5.7" }, "dateparser": { @@ -425,9 +425,16 @@ "sha256:84d9dd047ffa80596e0f246e2eab0b391788b0503584e8945f2368256d2735ff", "sha256:9d643ff0a55b762d5cdb124b8eaa99c66322e2157b69160bc32796e824360e6d" ], - "markers": "python_full_version >= '3.5.0'", + "markers": "python_version >= '3.5'", "version": "==3.3" }, + "instaloader": { + "hashes": [ + "sha256:ba925a87e2c305a3d24173d1bb0457d5a7e2e77dbac7206eeeb46f9104ecb08e" + ], + "index": "pypi", + "version": "==4.9.5" + }, "itsdangerous": { "hashes": [ "sha256:2c2349112351b88699d8d4b6b075022c0808887cb7ad10069318a8b0bc88db44", @@ -625,7 +632,7 @@ "sha256:8139f29aac13e25d502680e9e19963e83f16838d48a0d71c287fe40e7067fbca", "sha256:9859c40929662bec5d64f34d01c99e093149682a3f38915dc0655d5a633dd918" ], - "markers": "python_version >= '3.6'", + "markers": "python_full_version >= '3.6.0'", "version": "==3.2.2" }, "outcome": { @@ -641,7 +648,7 @@ "sha256:dd47c42927d89ab911e606518907cc2d3a1f38bbd026385970643f9c5b8ecfeb", "sha256:ef103e05f519cdc783ae24ea4e2e0f508a9c99b2d4969652eed6a2e1ea5bd522" ], - "markers": "python_version >= '3.6'", + "markers": "python_full_version >= '3.6.0'", "version": "==21.3" }, "protobuf": { @@ -754,7 +761,7 @@ "sha256:56a8508ae95f98e2b9bdf93a6be5ae3f7d8af858b43e02c5a2ff083726be40c1", "sha256:f643f331ab57ba3c9d89212ee4a2dabc6e94f117cf4eefde99a0574720d14c42" ], - "markers": "python_version >= '3.6'", + "markers": "python_full_version >= '3.6.0'", "version": "==2.13.0" }, "pyparsing": { @@ -786,7 +793,7 @@ "sha256:b7e3b04a59693c42c36f9ab1cc2acc46fa5df8c78e178fc33a8d4cd05c8d498f", "sha256:d92a187be61fe482e4fd675b6d52200e7be63a12b724abbf931a40ce4fa92938" ], - "markers": "python_full_version >= '3.5.0'", + "markers": "python_version >= '3.5'", "version": "==0.20.0" }, "python-slugify": { @@ -807,10 +814,9 @@ }, "pytz": { "hashes": [ - "sha256:2c0784747071402c6e99f0bafdb7da0fa22645f06554c7ae06bf6358897e9c91", - "sha256:48ce799d83b6f8aab2020e369b627446696619e79645419610b9facd909b3174" + "sha256:335ab46900b1465e714b4fda4963d87363264eb662aab5e65da039c25f1f5b22" ], - "version": "==2022.4" + "version": "==2022.5" }, "pytz-deprecation-shim": { "hashes": [ @@ -943,7 +949,7 @@ "sha256:fbc88d3ba402b5d041d204ec2449c4078898f89c4a6e6f0ed1c1a510ef1e221d", "sha256:fbd3fe37353c62fd0eb19fb76f78aa693716262bcd5f9c14bb9e5aca4b3f0dc4" ], - "markers": "python_version >= '3.6'", + "markers": "python_full_version >= '3.6.0'", "version": "==2022.3.2" }, "requests": { @@ -983,7 +989,7 @@ "sha256:90260d9058e514786967344d0ef75fa8727eed8a7d2e43ce9f4bcf1b536174f7", "sha256:e38464a49c6c85d7f1351b0126661487a7e0a14a50f1675ec50eb34d4f20ef21" ], - "markers": "python_version >= '3.6' and python_version < '4'", + "markers": "python_version < '4' and python_full_version >= '3.6.0'", "version": "==4.9" }, "s3transfer": { @@ -1037,7 +1043,7 @@ "sha256:3b2503d3c7084a42b1ebd08116e5f81aadfaea95863628c80a3b774a11b7c759", "sha256:fc53893b3da2c33de295667a0e19f078c14bf86544af307354de5fcf12a3f30d" ], - "markers": "python_version >= '3.6'", + "markers": "python_full_version >= '3.6.0'", "version": "==2.3.2.post1" }, "telethon": { @@ -1083,7 +1089,7 @@ "sha256:5b558f6e83cc20a37c3b61202476c5295d1addf57bd65543364e0337e37ed2bc", "sha256:a3d34de8fac26023eee701ed1e7bf4da9a8326b61a62934ec9e53b64970fd8fe" ], - "markers": "python_full_version >= '3.5.0'", + "markers": "python_version >= '3.5'", "version": "==0.9.2" }, "typing-extensions": { @@ -1106,7 +1112,7 @@ "sha256:323161b22b7802fdc78f20ca5f6073639c64f1a7227c40cd3e19fd1d0ce6650a", "sha256:e15b2b3005e2546108af42a0eb4ccab4d9e225e2dfbf4f77aad50c70a4b1f3ab" ], - "markers": "python_version >= '3.6'", + "markers": "python_full_version >= '3.6.0'", "version": "==2022.5" }, "tzlocal": { @@ -1114,7 +1120,7 @@ "sha256:89885494684c929d9191c57aa27502afc87a579be5cdd3225c77c463ea043745", "sha256:ee5842fa3a795f023514ac2d801c4a81d1743bbe642e3940143326b3a00addd7" ], - "markers": "python_version >= '3.6'", + "markers": "python_full_version >= '3.6.0'", "version": "==4.2" }, "uritemplate": { @@ -1122,11 +1128,10 @@ "sha256:4346edfc5c3b79f694bccd6d6099a322bbeb628dbf2cd86eea55a456ce5124f0", "sha256:830c08b8d99bdd312ea4ead05994a38e8936266f84b9a7878232db50b044e02e" ], - "markers": "python_version >= '3.6'", + "markers": "python_full_version >= '3.6.0'", "version": "==4.1.1" }, "urllib3": { - "extras": [], "hashes": [ "sha256:44ece4d53fb1706f667c9bd1c648f5469a2ec925fcf3a776667042d645472c14", "sha256:aabaf16477806a5e1dd19aa41f8c2b7950dd3c746362d7e3223dbe6de6ac448e" @@ -1228,5 +1233,30 @@ "version": "==2022.5.18" } }, - "develop": {} + "develop": { + "autopep8": { + "hashes": [ + "sha256:6f09e90a2be784317e84dc1add17ebfc7abe3924239957a37e5040e27d812087", + "sha256:ca9b1a83e53a7fad65d731dc7a2a2d50aa48f43850407c59f6a1a306c4201142" + ], + "index": "pypi", + "version": "==1.7.0" + }, + "pycodestyle": { + "hashes": [ + "sha256:2c9607871d58c76354b697b42f5d57e1ada7d261c261efac224b664affdc5785", + "sha256:d1735fc58b418fd7c5f658d28d943854f8a849b01a5d0a1e6f3f3fdd0166804b" + ], + "markers": "python_version >= '3.6'", + "version": "==2.9.1" + }, + "toml": { + "hashes": [ + "sha256:806143ae5bfb6a3c6e736a764057db0e6a0e05e338b5630894a5f779cabb4f9b", + "sha256:b3bda1d108d5dd99f4a20d24d9c348e91c4db7ab1b749200bded2f839ccbe68f" + ], + "markers": "python_version >= '2.6' and python_version not in '3.0, 3.1, 3.2, 3.3'", + "version": "==0.10.2" + } + } } diff --git a/archivers/__init__.py b/archivers/__init__.py index 403ebea..7f51e39 100644 --- a/archivers/__init__.py +++ b/archivers/__init__.py @@ -7,4 +7,5 @@ from .wayback_archiver import WaybackArchiver from .youtubedl_archiver import YoutubeDLArchiver from .twitter_archiver import TwitterArchiver from .vk_archiver import VkArchiver -from .twitter_api_archiver import TwitterApiArchiver \ No newline at end of file +from .twitter_api_archiver import TwitterApiArchiver +from .instagram_archiver import InstagramArchiver \ No newline at end of file diff --git a/archivers/instagram_archiver.py b/archivers/instagram_archiver.py new file mode 100644 index 0000000..1539527 --- /dev/null +++ b/archivers/instagram_archiver.py @@ -0,0 +1,128 @@ +import re, os, shutil, html +import instaloader # https://instaloader.github.io/as-module.html +from loguru import logger + +from .base_archiver import Archiver, ArchiveResult +from configs import Config +from storages import Storage + + +class InstagramArchiver(Archiver): + """ + Uses Instaloader to download either a post (inc images, videos, text) or as much as possible from a profile (posts, stories, highlights, ) + """ + name = "instagram" + DOWNLOAD_FOLDER = "instaloader" + # NB: post should be tested before profile + # https://regex101.com/r/MGPquX/1 + post_pattern = re.compile(r"(?:(?:http|https):\/\/)?(?:www.)?(?:instagram.com|instagr.am|instagr.com)\/(?:p|reel)\/(\w+)") + # https://regex101.com/r/6Wbsxa/1 + profile_pattern = re.compile(r"(?:(?:http|https):\/\/)?(?:www.)?(?:instagram.com|instagr.am|instagr.com)\/(\w+)") + + def __init__(self, storage: Storage, config: Config): + super().__init__(storage, config) + self.insta = instaloader.Instaloader(download_geotags=True, download_comments=True, compress_json=False, dirname_pattern=self.DOWNLOAD_FOLDER, filename_pattern="{date_utc}_UTC_{target}__{typename}") + if config.instagram_config: + self.insta.login(config.instagram_config.username, config.instagram_config.password) + + def download(self, url, check_if_exists=False): + post_matches = self.post_pattern.findall(url) + profile_matches = self.profile_pattern.findall(url) + + # return if not a valid instagram link + if not len(post_matches) and not len(profile_matches): + return + + # check if already uploaded + key = self.get_html_key(url) + if check_if_exists and self.storage.exists(key): + # only s3 storage supports storage.exists as not implemented on gd + cdn_url = self.storage.get_cdn_url(key) + screenshot = self.get_screenshot(url) + wacz = self.get_wacz(url) + return ArchiveResult(status='already archived', cdn_url=cdn_url, screenshot=screenshot, wacz=wacz) + + try: + # process if post + if len(post_matches): + return self.download_post(url, post_matches[0]) + + # process if profile + if len(profile_matches): + return self.download_profile(url, profile_matches[0]) + finally: + shutil.rmtree(self.DOWNLOAD_FOLDER, ignore_errors=True) + + def download_post(self, url, post_id): + logger.debug(f"Instagram {post_id=} detected in {url=}") + + post = instaloader.Post.from_shortcode(self.insta.context, post_id) + if self.insta.download_post(post, target=post.owner_username): + return self.upload_downloaded_content(url, post.title, post._asdict(), post.date) + + def download_profile(self, url, username): + # gets posts, posts where username is tagged, igtv postss, stories, and highlights + logger.debug(f"Instagram {username=} detected in {url=}") + + profile = instaloader.Profile.from_username(self.insta.context, username) + try: + for post in profile.get_posts(): + try: self.insta.download_post(post, target=f"profile_post_{post.owner_username}") + except Exception as e: logger.error(f"Failed to download post: {post.shortcode}: {e}") + except Exception as e: logger.error(f"Failed profile.get_posts: {e}") + + try: + for post in profile.get_tagged_posts(): + try: self.insta.download_post(post, target=f"tagged_post_{post.owner_username}") + except Exception as e: logger.error(f"Failed to download tagged post: {post.shortcode}: {e}") + except Exception as e: logger.error(f"Failed profile.get_tagged_posts: {e}") + + try: + for post in profile.get_igtv_posts(): + try: self.insta.download_post(post, target=f"igtv_post_{post.owner_username}") + except Exception as e: logger.error(f"Failed to download igtv post: {post.shortcode}: {e}") + except Exception as e: logger.error(f"Failed profile.get_igtv_posts: {e}") + + try: + for story in self.insta.get_stories([profile.userid]): + for item in story.get_items(): + try: self.insta.download_storyitem(item, target=f"story_item_{story.owner_username}") + except Exception as e: logger.error(f"Failed to download story item: {item}: {e}") + except Exception as e: logger.error(f"Failed get_stories: {e}") + + try: + for highlight in self.insta.get_highlights(profile.userid): + for item in highlight.get_items(): + try: self.insta.download_storyitem(item, target=f"highlight_item_{highlight.owner_username}") + except Exception as e: logger.error(f"Failed to download highlight item: {item}: {e}") + except Exception as e: logger.error(f"Failed get_highlights: {e}") + + return self.upload_downloaded_content(url, f"@{username}", profile._asdict(), None) + + def upload_downloaded_content(self, url, title, content, date): + status = "success" + try: + uploaded_media = [] + for f in os.listdir(self.DOWNLOAD_FOLDER): + if os.path.isfile((filename := os.path.join(self.DOWNLOAD_FOLDER, f))): + key = self.get_key(filename) + self.storage.upload(filename, key) + hash = self.get_hash(filename) + cdn_url = self.storage.get_cdn_url(key) + uploaded_media.append({'cdn_url': cdn_url, 'key': key, 'hash': hash}) + assert len(uploaded_media) > 1, "No uploaded media found" + + uploaded_media.sort(key=lambda m:m["key"], reverse=True) + + page_cdn, page_hash, _ = self.generate_media_page_html(url, uploaded_media, html.escape(str(content))) + except Exception as e: + logger.error(f"Could not fetch instagram post {url} due to: {e}") + status = "error" + finally: + shutil.rmtree(self.DOWNLOAD_FOLDER, ignore_errors=True) + + if status == "success": + screenshot = self.get_screenshot(url) + wacz = self.get_wacz(url) + + return ArchiveResult(status=status, cdn_url=page_cdn, title=title, timestamp=date, hash=page_hash, screenshot=screenshot, wacz=wacz) diff --git a/archivers/telethon_archiver.py b/archivers/telethon_archiver.py index 9f9bbbf..5c147de 100644 --- a/archivers/telethon_archiver.py +++ b/archivers/telethon_archiver.py @@ -1,6 +1,4 @@ -import os, re - -import html +import os, re, html from loguru import logger from telethon.sync import TelegramClient from telethon.errors import ChannelInvalidError diff --git a/auto_archive.py b/auto_archive.py index 50719a3..3412b0a 100644 --- a/auto_archive.py +++ b/auto_archive.py @@ -4,8 +4,8 @@ from loguru import logger from slugify import slugify from urllib.parse import quote -from archivers import TelethonArchiver, TelegramArchiver, TiktokArchiver, YoutubeDLArchiver, TwitterArchiver, TwitterApiArchiver, VkArchiver, WaybackArchiver, ArchiveResult, Archiver -from utils import GWorksheet, mkdir_if_not_exists, expand_url +from archivers import TelethonArchiver, TelegramArchiver, TiktokArchiver, YoutubeDLArchiver, TwitterArchiver, TwitterApiArchiver, VkArchiver, WaybackArchiver, InstagramArchiver, ArchiveResult, Archiver +from utils import GWorksheet, expand_url from configs import Config from storages import Storage @@ -111,6 +111,7 @@ def process_sheet(c: Config): TelethonArchiver(storage, c), TiktokArchiver(storage, c), TwitterApiArchiver(storage, c), + InstagramArchiver(storage, c), YoutubeDLArchiver(storage, c), TelegramArchiver(storage, c), TwitterArchiver(storage, c), diff --git a/configs/__init__.py b/configs/__init__.py index 6940ed3..1f01b62 100644 --- a/configs/__init__.py +++ b/configs/__init__.py @@ -3,4 +3,5 @@ from .selenium_config import SeleniumConfig from .telethon_config import TelethonConfig from .wayback_config import WaybackConfig from .twitter_api_config import TwitterApiConfig -from .vk_config import VkConfig \ No newline at end of file +from .vk_config import VkConfig +from .instagram_config import InstagramConfig \ No newline at end of file diff --git a/configs/config.py b/configs/config.py index 6e97dc4..80e4881 100644 --- a/configs/config.py +++ b/configs/config.py @@ -12,6 +12,7 @@ from .selenium_config import SeleniumConfig from .vk_config import VkConfig from .twitter_api_config import TwitterApiConfig from .browsertrix_config import BrowsertrixConfig +from .instagram_config import InstagramConfig from storages import S3Config, S3Storage, GDStorage, GDConfig, LocalStorage, LocalConfig @@ -180,6 +181,16 @@ class Config: self.vk_config = None logger.debug(f"'vk' key not present in the {self.config_file=}") + # instagram config + if "instagram" in secrets: + self.instagram_config = InstagramConfig( + username=secrets["instagram"]["username"], + password=secrets["instagram"]["password"] + ) + else: + self.instagram_config = None + logger.debug(f"'instagram' key not present in the {self.config_file=}") + del self.config["secrets"] # delete to prevent leaks def set_log_files(self): diff --git a/configs/instagram_config.py b/configs/instagram_config.py new file mode 100644 index 0000000..73c45bc --- /dev/null +++ b/configs/instagram_config.py @@ -0,0 +1,8 @@ + +from dataclasses import dataclass + + +@dataclass +class InstagramConfig: + username: str + password: str diff --git a/example.config.yaml b/example.config.yaml index e42d10f..b26f58f 100644 --- a/example.config.yaml +++ b/example.config.yaml @@ -67,6 +67,11 @@ secrets: username: "phone number or email" password: "password" + # instagram credentials + instagram: + username: "username" + password: "password" + google_sheets: # local filename: defaults to service_account.json, see https://gspread.readthedocs.io/en/latest/oauth2.html#for-bots-using-service-account service_account: "service_account.json" From 3f121d800e0d0dd386a0943710c784357c35949f Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Tue, 18 Oct 2022 16:36:27 +0100 Subject: [PATCH 002/190] catch bad instagram login --- archivers/instagram_archiver.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/archivers/instagram_archiver.py b/archivers/instagram_archiver.py index 1539527..5d2fefe 100644 --- a/archivers/instagram_archiver.py +++ b/archivers/instagram_archiver.py @@ -23,7 +23,11 @@ class InstagramArchiver(Archiver): super().__init__(storage, config) self.insta = instaloader.Instaloader(download_geotags=True, download_comments=True, compress_json=False, dirname_pattern=self.DOWNLOAD_FOLDER, filename_pattern="{date_utc}_UTC_{target}__{typename}") if config.instagram_config: - self.insta.login(config.instagram_config.username, config.instagram_config.password) + try: + self.insta.login(config.instagram_config.username, config.instagram_config. + password) + except Exception as e: + logger.error(f"Unable to finish login: {e}") def download(self, url, check_if_exists=False): post_matches = self.post_pattern.findall(url) From 63f53358d3c92170c85b988d4e284e80a52443ff Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Tue, 18 Oct 2022 16:38:12 +0100 Subject: [PATCH 003/190] adds traceback --- archivers/instagram_archiver.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/archivers/instagram_archiver.py b/archivers/instagram_archiver.py index 5d2fefe..46b53b8 100644 --- a/archivers/instagram_archiver.py +++ b/archivers/instagram_archiver.py @@ -1,4 +1,4 @@ -import re, os, shutil, html +import re, os, shutil, html, traceback import instaloader # https://instaloader.github.io/as-module.html from loguru import logger @@ -27,7 +27,7 @@ class InstagramArchiver(Archiver): self.insta.login(config.instagram_config.username, config.instagram_config. password) except Exception as e: - logger.error(f"Unable to finish login: {e}") + logger.error(f"Unable to finish login: {e}\n{traceback.format_exc()}") def download(self, url, check_if_exists=False): post_matches = self.post_pattern.findall(url) From 6c80a5b82d5df5e2e26afb646ff385318466e556 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Tue, 18 Oct 2022 17:35:59 +0100 Subject: [PATCH 004/190] session file logic --- .gitignore | 3 ++- archivers/instagram_archiver.py | 10 +++++++++- configs/config.py | 3 ++- configs/instagram_config.py | 1 + example.config.yaml | 1 + 5 files changed, 15 insertions(+), 3 deletions(-) diff --git a/.gitignore b/.gitignore index 04b03ee..59ed096 100644 --- a/.gitignore +++ b/.gitignore @@ -22,4 +22,5 @@ credentials.json secrets/* browsertrix/* browsertrix-tmp/* -instaloader/* \ No newline at end of file +instaloader/* +instaloader.session \ No newline at end of file diff --git a/archivers/instagram_archiver.py b/archivers/instagram_archiver.py index 46b53b8..d7a3989 100644 --- a/archivers/instagram_archiver.py +++ b/archivers/instagram_archiver.py @@ -26,8 +26,16 @@ class InstagramArchiver(Archiver): try: self.insta.login(config.instagram_config.username, config.instagram_config. password) + #TODO: wait for this issue to be fixed https://github.com/instaloader/instaloader/issues/1758 + self.insta.save_session_to_file(config.instagram_config.session_file) except Exception as e: - logger.error(f"Unable to finish login: {e}\n{traceback.format_exc()}") + logger.error(f"Unable to finish login (retrying from file): {e}\n{traceback.format_exc()}") + try: + self.insta.load_session_from_file(config.instagram_config.username, config.instagram_config.session_file) + except Exception as e2: + logger.error(f"Unable to login from session file: {e2}\n{traceback.format_exc()}") + + def download(self, url, check_if_exists=False): post_matches = self.post_pattern.findall(url) diff --git a/configs/config.py b/configs/config.py index 80e4881..372b3d7 100644 --- a/configs/config.py +++ b/configs/config.py @@ -185,7 +185,8 @@ class Config: if "instagram" in secrets: self.instagram_config = InstagramConfig( username=secrets["instagram"]["username"], - password=secrets["instagram"]["password"] + password=secrets["instagram"]["password"], + session_file=secrets["instagram"].get("session_file", "instaloader.session") ) else: self.instagram_config = None diff --git a/configs/instagram_config.py b/configs/instagram_config.py index 73c45bc..a9f26b4 100644 --- a/configs/instagram_config.py +++ b/configs/instagram_config.py @@ -6,3 +6,4 @@ from dataclasses import dataclass class InstagramConfig: username: str password: str + session_file: str diff --git a/example.config.yaml b/example.config.yaml index b26f58f..7cd4ecb 100644 --- a/example.config.yaml +++ b/example.config.yaml @@ -71,6 +71,7 @@ secrets: instagram: username: "username" password: "password" + session_file: "instaloader.session" # <- default value google_sheets: # local filename: defaults to service_account.json, see https://gspread.readthedocs.io/en/latest/oauth2.html#for-bots-using-service-account From 54c572258c314d4c93019c7625f0affcec07f528 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Tue, 18 Oct 2022 17:46:40 +0100 Subject: [PATCH 005/190] fix tty --- archivers/base_archiver.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/archivers/base_archiver.py b/archivers/base_archiver.py index 96e0fbf..40ad861 100644 --- a/archivers/base_archiver.py +++ b/archivers/base_archiver.py @@ -215,7 +215,7 @@ class Archiver(ABC): cmd = [ "docker", "run", "-v", f"{browsertrix_home}:/crawls/", - "-it", + # "-it", # this leads to "the input device is not a TTY" "webrecorder/browsertrix-crawler", "crawl", "--url", url, "--scopeType", "page", From 4d2b7b404097ce6251ea04510ea3082a42cdffdd Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Wed, 19 Oct 2022 11:27:17 +0100 Subject: [PATCH 006/190] reverse order of login attempts --- archivers/instagram_archiver.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/archivers/instagram_archiver.py b/archivers/instagram_archiver.py index d7a3989..a2b1147 100644 --- a/archivers/instagram_archiver.py +++ b/archivers/instagram_archiver.py @@ -24,16 +24,16 @@ class InstagramArchiver(Archiver): self.insta = instaloader.Instaloader(download_geotags=True, download_comments=True, compress_json=False, dirname_pattern=self.DOWNLOAD_FOLDER, filename_pattern="{date_utc}_UTC_{target}__{typename}") if config.instagram_config: try: - self.insta.login(config.instagram_config.username, config.instagram_config. - password) - #TODO: wait for this issue to be fixed https://github.com/instaloader/instaloader/issues/1758 - self.insta.save_session_to_file(config.instagram_config.session_file) + self.insta.load_session_from_file(config.instagram_config.username, config.instagram_config.session_file) except Exception as e: - logger.error(f"Unable to finish login (retrying from file): {e}\n{traceback.format_exc()}") + logger.error(f"Unable to login from session file: {e}\n{traceback.format_exc()}") try: - self.insta.load_session_from_file(config.instagram_config.username, config.instagram_config.session_file) + self.insta.login(config.instagram_config.username, config.instagram_config. + password) + #TODO: wait for this issue to be fixed https://github.com/instaloader/instaloader/issues/1758 + self.insta.save_session_to_file(config.instagram_config.session_file) except Exception as e2: - logger.error(f"Unable to login from session file: {e2}\n{traceback.format_exc()}") + logger.error(f"Unable to finish login (retrying from file): {e2}\n{traceback.format_exc()}") From ac4f1b6132efa0d038463d242d48cc9456af5023 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Wed, 19 Oct 2022 11:37:04 +0100 Subject: [PATCH 007/190] readme updates --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index e420629..dbaabc2 100644 --- a/README.md +++ b/README.md @@ -168,6 +168,7 @@ graph TD A(Archiver) -->|parent of| B(TelethonArchiver) A -->|parent of| C(TiktokArchiver) A -->|parent of| D(YoutubeDLArchiver) + A -->|parent of| D(InstagramArchiver) A -->|parent of| E(TelegramArchiver) A -->|parent of| F(TwitterArchiver) A -->|parent of| G(VkArchiver) From 22363cb8b9c22c4d549fd7ee5187de69875edee5 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Thu, 20 Oct 2022 11:59:23 +0100 Subject: [PATCH 008/190] adds information on browsertrix usage --- README.md | 4 ++-- example.config.yaml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index dbaabc2..8bdc7d5 100644 --- a/README.md +++ b/README.md @@ -18,8 +18,8 @@ You also need: 3. [firefox](https://www.mozilla.org/en-US/firefox/new/) and [geckodriver](https://github.com/mozilla/geckodriver/releases) on a path folder like `/usr/local/bin`. 4. [fonts-noto](https://fonts.google.com/noto) to deal with multiple unicode characters during selenium/geckodriver's screenshots: `sudo apt install fonts-noto -y`. 5. Internet Archive credentials can be retrieved from https://archive.org/account/s3.php. -6. If you would like to take archival [WACZ](https://specs.webrecorder.net/wacz/1.1.1/) snapshots using [browsertrix-crawler](https://github.com/webrecorder/browsertrix-crawler) - in addition to screenshots you will need to install [Docker](https://www.docker.com/). +6. If you would like to take archival [WACZ](https://specs.webrecorder.net/wacz/1.1.1/) snapshots using [browsertrix-crawler](https://github.com/webrecorder/browsertrix-crawler) in addition to screenshots you will need to install [Docker](https://www.docker.com/). + 1. To improve the websites browsertrix can archive you can also create a custom profile by running `docker run -p 9222:9222 -p 9223:9223 -v $PWD/browsertrix/crawls/profiles:/crawls/profiles/ -it webrecorder/browsertrix-crawler create-login-profile --interactive --url "https://youtube.com"`, going to [http://localhost:9223/](http://localhost:9223/) and accepting the cookies prompt on youtube, and then navigating to other websites and logging in as per your needs, so as to access more publicly blocked content, and then specifying the created `profile.tar.gz` in your config file under `execution.browsertrix.profile`. ### Configuration file Configuration is done via a config.yaml file (see [example.config.yaml](example.config.yaml)) and some properties of that file can be overwritten via command line arguments. Here is the current result from running the `python auto_archive.py --help`: diff --git a/example.config.yaml b/example.config.yaml index 7cd4ecb..c6ad8f8 100644 --- a/example.config.yaml +++ b/example.config.yaml @@ -134,6 +134,6 @@ execution: screenshot: screenshot hash: hash wacz: wacz - # if you want the replaypage to work, make sure to allow CORS on your bucket + # if you want the replaypage to work, make sure to allow CORS on your bucket, see https://replayweb.page/docs/embedding#cors-restrictions replaywebpage: replaywebpage From 7a700acd8e59d3ce7c4bc53479c6270bc4721bd2 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Mon, 31 Oct 2022 10:35:01 +0000 Subject: [PATCH 009/190] hotfix for #65 --- archivers/base_archiver.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/archivers/base_archiver.py b/archivers/base_archiver.py index 40ad861..076b7ca 100644 --- a/archivers/base_archiver.py +++ b/archivers/base_archiver.py @@ -240,6 +240,14 @@ class Archiver(ABC): except Exception as e: logger.error(f"WACZ generation failed: {e}") return + try: + # TODO: is there a better way to manage the containers, like reusing? + # https://github.com/bellingcat/auto-archiver/issues/65 + cmd_clean_docker = 'docker rm $(docker stop $(docker ps -a -q --filter ancestor=webrecorder/browsertrix-crawler --format="{{.ID}}"))' + logger.info(f"Cleaning docker containers: {cmd_clean_docker}") + os.system(cmd_clean_docker) + except Exception as e: + logger.error(f"Could not clean dangling docker containers: {e}") filename = os.path.join(browsertrix_home, "collections", collection, f"{collection}.wacz") From 29e1872e872573de9cd9188d093b78023c1765b4 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Mon, 31 Oct 2022 10:41:27 +0000 Subject: [PATCH 010/190] fix: rm stopped containers only --- archivers/base_archiver.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/archivers/base_archiver.py b/archivers/base_archiver.py index 076b7ca..50fa588 100644 --- a/archivers/base_archiver.py +++ b/archivers/base_archiver.py @@ -243,7 +243,7 @@ class Archiver(ABC): try: # TODO: is there a better way to manage the containers, like reusing? # https://github.com/bellingcat/auto-archiver/issues/65 - cmd_clean_docker = 'docker rm $(docker stop $(docker ps -a -q --filter ancestor=webrecorder/browsertrix-crawler --format="{{.ID}}"))' + cmd_clean_docker = 'docker rm $(docker ps -a -q --filter ancestor=webrecorder/browsertrix-crawler --format="{{.ID}}")' logger.info(f"Cleaning docker containers: {cmd_clean_docker}") os.system(cmd_clean_docker) except Exception as e: From c8fa077df7d01de3c62c2b64e3fc2830dc9b22fd Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Mon, 31 Oct 2022 17:10:55 +0000 Subject: [PATCH 011/190] docker initial files --- .dockerignore | 17 +++++++++++++++++ Dockerfile | 20 ++++++++++++++++++++ 2 files changed, 37 insertions(+) create mode 100644 .dockerignore create mode 100644 Dockerfile diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..a4d2bbb --- /dev/null +++ b/.dockerignore @@ -0,0 +1,17 @@ +logs/ +browsertrix-tmp/ +tmp*/ +temp/ +.DS_Store +__pycache__/ +local_archive/ +config*.json +config.json +*.env +credentials.json +secrets/ +instaloader/ +instaloader.session +vk_config*.json +anon* +geckodriver.log diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..61a82c2 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,20 @@ +From python:3.10 + +WORKDIR /usr/src/app + +COPY . . + + +# TODO: use custom ffmpeg builds instead of apt-get install +RUN pip install --upgrade pip && \ + pip install pipenv && \ + apt-get update && \ + apt-get install -y gcc ffmpeg fonts-noto firefox-esr && \ + wget https://github.com/mozilla/geckodriver/releases/download/v0.32.0/geckodriver-v0.32.0-linux64.tar.gz && \ + tar -xvzf geckodriver* -C /usr/bin && \ + chmod +x /usr/bin/geckodriver && \ + rm geckodriver-v* && \ + export PATH=$PATH:/usr/bin/ && \ + pipenv install --python=3.10 + +CMD ["pipenv", "run", "python", "auto_archive.py"] \ No newline at end of file From a9df992f6663b8f51672186f8c788a682df7659a Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Wed, 2 Nov 2022 16:51:32 +0000 Subject: [PATCH 012/190] WiP --- Dockerfile | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/Dockerfile b/Dockerfile index 61a82c2..b2de36a 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,6 +1,6 @@ From python:3.10 -WORKDIR /usr/src/app +WORKDIR /app COPY . . @@ -9,12 +9,16 @@ COPY . . RUN pip install --upgrade pip && \ pip install pipenv && \ apt-get update && \ - apt-get install -y gcc ffmpeg fonts-noto firefox-esr && \ - wget https://github.com/mozilla/geckodriver/releases/download/v0.32.0/geckodriver-v0.32.0-linux64.tar.gz && \ - tar -xvzf geckodriver* -C /usr/bin && \ - chmod +x /usr/bin/geckodriver && \ + apt-get install -y gcc ffmpeg fonts-noto firefox-esr + +RUN wget https://github.com/mozilla/geckodriver/releases/download/v0.32.0/geckodriver-v0.32.0-linux64.tar.gz && \ + tar -xvzf geckodriver* -C /usr/local/bin && \ + chmod +x /usr/local/bin/geckodriver && \ rm geckodriver-v* && \ - export PATH=$PATH:/usr/bin/ && \ pipenv install --python=3.10 -CMD ["pipenv", "run", "python", "auto_archive.py"] \ No newline at end of file +# CMD ["pipenv", "run", "python", "auto_archive.py"] +ENTRYPOINT ["pipenv", "run", "python", "auto_archive.py"] + +# should be executed with 2 volumes +# docker run -v /var/run/docker.sock:/var/run/docker.sock -v $PWD/secrets:/app/ aa --help \ No newline at end of file From 50e03ba565918ecd4b2c3bf56b62ea64d141a676 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Wed, 2 Nov 2022 16:59:44 +0000 Subject: [PATCH 013/190] closes #65 with simpler solution --- archivers/base_archiver.py | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/archivers/base_archiver.py b/archivers/base_archiver.py index 50fa588..3dc5ba1 100644 --- a/archivers/base_archiver.py +++ b/archivers/base_archiver.py @@ -214,6 +214,7 @@ class Archiver(ABC): browsertrix_home = os.path.join(os.getcwd(), "browsertrix-tmp") cmd = [ "docker", "run", + "--rm", # delete container once it has completed running "-v", f"{browsertrix_home}:/crawls/", # "-it", # this leads to "the input device is not a TTY" "webrecorder/browsertrix-crawler", "crawl", @@ -240,14 +241,6 @@ class Archiver(ABC): except Exception as e: logger.error(f"WACZ generation failed: {e}") return - try: - # TODO: is there a better way to manage the containers, like reusing? - # https://github.com/bellingcat/auto-archiver/issues/65 - cmd_clean_docker = 'docker rm $(docker ps -a -q --filter ancestor=webrecorder/browsertrix-crawler --format="{{.ID}}")' - logger.info(f"Cleaning docker containers: {cmd_clean_docker}") - os.system(cmd_clean_docker) - except Exception as e: - logger.error(f"Could not clean dangling docker containers: {e}") filename = os.path.join(browsertrix_home, "collections", collection, f"{collection}.wacz") From 629cd586db7e8119f9c6f6d4daed5967d1c71c9c Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Tue, 8 Nov 2022 13:59:09 +0000 Subject: [PATCH 014/190] adds session_file for missing archivers --- configs/config.py | 7 +++++-- configs/telethon_config.py | 1 + configs/vk_config.py | 1 + example.config.yaml | 4 ++++ 4 files changed, 11 insertions(+), 2 deletions(-) diff --git a/configs/config.py b/configs/config.py index 372b3d7..01b8173 100644 --- a/configs/config.py +++ b/configs/config.py @@ -34,6 +34,7 @@ class Config: def __init__(self): self.parser = self.get_argument_parser() self.folder = "" + self.is_docker = bool(os.environ.get("IS_DOCKER", 0)) def parse(self): self.args = self.parser.parse_args() @@ -152,7 +153,8 @@ class Config: self.telegram_config = TelethonConfig( api_id=secrets["telegram"]["api_id"], api_hash=secrets["telegram"]["api_hash"], - bot_token=secrets["telegram"].get("bot_token", None) + bot_token=secrets["telegram"].get("bot_token", None), + session_file=secrets["telegram"].get("session_file", "./anon") ) else: self.telegram_config = None @@ -175,7 +177,8 @@ class Config: if "vk" in secrets: self.vk_config = VkConfig( username=secrets["vk"]["username"], - password=secrets["vk"]["password"] + password=secrets["vk"]["password"], + session_file=secrets["vk"].get("session_file", "./vk_config.v2.json") ) else: self.vk_config = None diff --git a/configs/telethon_config.py b/configs/telethon_config.py index 3099bb5..111c7bd 100644 --- a/configs/telethon_config.py +++ b/configs/telethon_config.py @@ -7,3 +7,4 @@ class TelethonConfig: api_id: str api_hash: str bot_token: str + session_file: str diff --git a/configs/vk_config.py b/configs/vk_config.py index db2e61c..4c3472c 100644 --- a/configs/vk_config.py +++ b/configs/vk_config.py @@ -6,3 +6,4 @@ from dataclasses import dataclass class VkConfig: username: str password: str + session_file: str diff --git a/example.config.yaml b/example.config.yaml index c6ad8f8..857265b 100644 --- a/example.config.yaml +++ b/example.config.yaml @@ -50,6 +50,8 @@ secrets: api_hash: your API hash # optional, but allows access to more content such as large videos, talk to @botfather bot_token: your bot-token + # optional, defaults to ./anon, records the telegram login session for future usage + session_file: "secrets/anon" # twitter configuration - API V2 only # if you don't provide credentials the less-effective unofficial TwitterArchiver will be used instead @@ -66,6 +68,8 @@ secrets: vk: username: "phone number or email" password: "password" + # optional, defaults to ./vk_config.v2.json, records VK login session for future usage + session_file: "secrets/vk_config.v2.json" # instagram credentials instagram: From 09f47383a3478ff43ff01aa659316782154167b6 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Tue, 8 Nov 2022 13:59:35 +0000 Subject: [PATCH 015/190] dockerfile improvements --- Dockerfile | 30 ++++++++++++++++++++---------- 1 file changed, 20 insertions(+), 10 deletions(-) diff --git a/Dockerfile b/Dockerfile index b2de36a..a9b4d7a 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,24 +1,34 @@ +# stage 1 - all dependencies From python:3.10 WORKDIR /app -COPY . . - - # TODO: use custom ffmpeg builds instead of apt-get install RUN pip install --upgrade pip && \ pip install pipenv && \ apt-get update && \ - apt-get install -y gcc ffmpeg fonts-noto firefox-esr - -RUN wget https://github.com/mozilla/geckodriver/releases/download/v0.32.0/geckodriver-v0.32.0-linux64.tar.gz && \ + apt-get install -y gcc ffmpeg fonts-noto firefox-esr && \ + wget https://github.com/mozilla/geckodriver/releases/download/v0.32.0/geckodriver-v0.32.0-linux64.tar.gz && \ tar -xvzf geckodriver* -C /usr/local/bin && \ chmod +x /usr/local/bin/geckodriver && \ - rm geckodriver-v* && \ - pipenv install --python=3.10 + rm geckodriver-v* + + +# install docker for WACZ +RUN curl -fsSL https://get.docker.com | sh + +# RUN git clone https://github.com/bellingcat/auto-archiver +# TODO: avoid copying unnecessary files, including .git +# COPY ./src/ . +COPY Pipfile Pipfile.lock ./ +RUN pipenv install --python=3.10 --system --deploy +# TODO: to avoid copying pipfile lock it should be on the .dockerignore +ENV IS_DOCKER=1 +COPY . . # CMD ["pipenv", "run", "python", "auto_archive.py"] -ENTRYPOINT ["pipenv", "run", "python", "auto_archive.py"] +ENTRYPOINT ["python", "auto_archive.py"] +# ENTRYPOINT ["docker-entrypoint.sh"] # should be executed with 2 volumes -# docker run -v /var/run/docker.sock:/var/run/docker.sock -v $PWD/secrets:/app/ aa --help \ No newline at end of file +# docker run -v /var/run/docker.sock:/var/run/docker.sock -v $PWD/secrets:/app/secrets -v $PWD/local_archive:/app/local_archive aa --help \ No newline at end of file From a8f7055696ee14f41a67e5487a6883c3872f274d Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Tue, 8 Nov 2022 13:59:59 +0000 Subject: [PATCH 016/190] reduces uncontrolled exceptions --- Pipfile.lock | 332 ++++++++++++++++++++----------------- archivers/base_archiver.py | 7 +- 2 files changed, 183 insertions(+), 156 deletions(-) diff --git a/Pipfile.lock b/Pipfile.lock index e2d1b1b..6aac097 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -42,10 +42,10 @@ }, "authlib": { "hashes": [ - "sha256:b83cf6360c8e92b0e9df0d1f32d675790bcc4e3c03977499b1eed24dcdef4252", - "sha256:ecf4a7a9f2508c0bb07e93a752dd3c495cfaffc20e864ef0ffc95e3f40d2abaf" + "sha256:2988fdf7d0a5c416f5a37ca4b1e7cee360094940229bc97909aed25880326c72", + "sha256:6de4508ba8125e438a35bcd910d55df7087dccd3dd8517095c2bd9853c372ec1" ], - "version": "==0.15.5" + "version": "==0.15.6" }, "beautifulsoup4": { "hashes": [ @@ -57,19 +57,19 @@ }, "boto3": { "hashes": [ - "sha256:3c6cc4e9e38cf4523267f89eb90c0b6084fa415cb4f44e3bf0cad6199340cc92", - "sha256:d28bcb98aee4d333b163c55b98341627d933dbf088832f7fc050893617be7dac" + "sha256:3b0fa19390895e664045713f2e47e63ad29c9f98b7bee6836dec7124953e48b8", + "sha256:9feb98e045736f943c2099d955415cfe44133e03d8e2d7581d2e5dc74d0ed064" ], "index": "pypi", - "version": "==1.24.92" + "version": "==1.26.1" }, "botocore": { "hashes": [ - "sha256:70cf2cb04968794ed4688cc3b07874f6f4c932e325611be4e693a995fdb481be", - "sha256:b49c34b80c782625905be75e669da4b42a99f074e0aa3007e15ccc6955682a07" + "sha256:75c65130ffab527d0a3d948c6d87eb8eac210e079e1ff2768c66484be57bb77c", + "sha256:e38b7cdce927cefabe45608dde61660b76458fba6624240dcdb6c4b8453d17f7" ], "markers": "python_version >= '3.7'", - "version": "==1.27.92" + "version": "==1.29.1" }, "brotli": { "hashes": [ @@ -159,7 +159,7 @@ "sha256:84c85a9078b11105f04f3036a9482ae10e4621616db313fe045dd24743a0820d", "sha256:fe86415d55e84719d75f8b69414f6438ac3547d2078ab91b67e779ef69378412" ], - "markers": "python_full_version >= '3.6.0'", + "markers": "python_version >= '3.6'", "version": "==2022.6.15" }, "cffi": { @@ -263,59 +263,59 @@ }, "cryptography": { "hashes": [ - "sha256:0297ffc478bdd237f5ca3a7dc96fc0d315670bfa099c04dc3a4a2172008a405a", - "sha256:10d1f29d6292fc95acb597bacefd5b9e812099d75a6469004fd38ba5471a977f", - "sha256:16fa61e7481f4b77ef53991075de29fc5bacb582a1244046d2e8b4bb72ef66d0", - "sha256:194044c6b89a2f9f169df475cc167f6157eb9151cc69af8a2a163481d45cc407", - "sha256:1db3d807a14931fa317f96435695d9ec386be7b84b618cc61cfa5d08b0ae33d7", - "sha256:3261725c0ef84e7592597606f6583385fed2a5ec3909f43bc475ade9729a41d6", - "sha256:3b72c360427889b40f36dc214630e688c2fe03e16c162ef0aa41da7ab1455153", - "sha256:3e3a2599e640927089f932295a9a247fc40a5bdf69b0484532f530471a382750", - "sha256:3fc26e22840b77326a764ceb5f02ca2d342305fba08f002a8c1f139540cdfaad", - "sha256:5067ee7f2bce36b11d0e334abcd1ccf8c541fc0bbdaf57cdd511fdee53e879b6", - "sha256:52e7bee800ec869b4031093875279f1ff2ed12c1e2f74923e8f49c916afd1d3b", - "sha256:64760ba5331e3f1794d0bcaabc0d0c39e8c60bf67d09c93dc0e54189dfd7cfe5", - "sha256:765fa194a0f3372d83005ab83ab35d7c5526c4e22951e46059b8ac678b44fa5a", - "sha256:79473cf8a5cbc471979bd9378c9f425384980fcf2ab6534b18ed7d0d9843987d", - "sha256:896dd3a66959d3a5ddcfc140a53391f69ff1e8f25d93f0e2e7830c6de90ceb9d", - "sha256:89ed49784ba88c221756ff4d4755dbc03b3c8d2c5103f6d6b4f83a0fb1e85294", - "sha256:ac7e48f7e7261207d750fa7e55eac2d45f720027d5703cd9007e9b37bbb59ac0", - "sha256:ad7353f6ddf285aeadfaf79e5a6829110106ff8189391704c1d8801aa0bae45a", - "sha256:b0163a849b6f315bf52815e238bc2b2346604413fa7c1601eea84bcddb5fb9ac", - "sha256:b6c9b706316d7b5a137c35e14f4103e2115b088c412140fdbd5f87c73284df61", - "sha256:c2e5856248a416767322c8668ef1845ad46ee62629266f84a8f007a317141013", - "sha256:ca9f6784ea96b55ff41708b92c3f6aeaebde4c560308e5fbbd3173fbc466e94e", - "sha256:d1a5bd52d684e49a36582193e0b89ff267704cd4025abefb9e26803adeb3e5fb", - "sha256:d3971e2749a723e9084dd507584e2a2761f78ad2c638aa31e80bc7a15c9db4f9", - "sha256:d4ef6cc305394ed669d4d9eebf10d3a101059bdcf2669c366ec1d14e4fb227bd", - "sha256:d9e69ae01f99abe6ad646947bba8941e896cb3aa805be2597a0400e0764b5818" + "sha256:068147f32fa662c81aebab95c74679b401b12b57494872886eb5c1139250ec5d", + "sha256:06fc3cc7b6f6cca87bd56ec80a580c88f1da5306f505876a71c8cfa7050257dd", + "sha256:25c1d1f19729fb09d42e06b4bf9895212292cb27bb50229f5aa64d039ab29146", + "sha256:402852a0aea73833d982cabb6d0c3bb582c15483d29fb7085ef2c42bfa7e38d7", + "sha256:4e269dcd9b102c5a3d72be3c45d8ce20377b8076a43cbed6f660a1afe365e436", + "sha256:5419a127426084933076132d317911e3c6eb77568a1ce23c3ac1e12d111e61e0", + "sha256:554bec92ee7d1e9d10ded2f7e92a5d70c1f74ba9524947c0ba0c850c7b011828", + "sha256:5e89468fbd2fcd733b5899333bc54d0d06c80e04cd23d8c6f3e0542358c6060b", + "sha256:65535bc550b70bd6271984d9863a37741352b4aad6fb1b3344a54e6950249b55", + "sha256:6ab9516b85bebe7aa83f309bacc5f44a61eeb90d0b4ec125d2d003ce41932d36", + "sha256:6addc3b6d593cd980989261dc1cce38263c76954d758c3c94de51f1e010c9a50", + "sha256:728f2694fa743a996d7784a6194da430f197d5c58e2f4e278612b359f455e4a2", + "sha256:785e4056b5a8b28f05a533fab69febf5004458e20dad7e2e13a3120d8ecec75a", + "sha256:78cf5eefac2b52c10398a42765bfa981ce2372cbc0457e6bf9658f41ec3c41d8", + "sha256:7f836217000342d448e1c9a342e9163149e45d5b5eca76a30e84503a5a96cab0", + "sha256:8d41a46251bf0634e21fac50ffd643216ccecfaf3701a063257fe0b2be1b6548", + "sha256:984fe150f350a3c91e84de405fe49e688aa6092b3525f407a18b9646f6612320", + "sha256:9b24bcff7853ed18a63cfb0c2b008936a9554af24af2fb146e16d8e1aed75748", + "sha256:b1b35d9d3a65542ed2e9d90115dfd16bbc027b3f07ee3304fc83580f26e43249", + "sha256:b1b52c9e5f8aa2b802d48bd693190341fae201ea51c7a167d69fc48b60e8a959", + "sha256:bbf203f1a814007ce24bd4d51362991d5cb90ba0c177a9c08825f2cc304d871f", + "sha256:be243c7e2bfcf6cc4cb350c0d5cdf15ca6383bbcb2a8ef51d3c9411a9d4386f0", + "sha256:bfbe6ee19615b07a98b1d2287d6a6073f734735b49ee45b11324d85efc4d5cbd", + "sha256:c46837ea467ed1efea562bbeb543994c2d1f6e800785bd5a2c98bc096f5cb220", + "sha256:dfb4f4dd568de1b6af9f4cda334adf7d72cf5bc052516e1b2608b683375dd95c", + "sha256:ed7b00096790213e09eb11c97cc6e2b757f15f3d2f85833cd2d3ec3fe37c1722" ], - "markers": "python_full_version >= '3.6.0'", - "version": "==38.0.1" + "markers": "python_version >= '3.6'", + "version": "==38.0.3" }, "dataclasses-json": { "hashes": [ "sha256:bc285b5f892094c3a53d558858a88553dd6a61a11ab1a8128a0e554385dcc5dd", "sha256:c2c11bc8214fbf709ffc369d11446ff6945254a7f09128154a7620613d8fda90" ], - "markers": "python_full_version >= '3.6.0'", + "markers": "python_version >= '3.6'", "version": "==0.5.7" }, "dateparser": { "hashes": [ - "sha256:038196b1f12c7397e38aad3d61588833257f6f552baa63a1499e6987fa8d42d9", - "sha256:9600874312ff28a41f96ec7ccdc73be1d1c44435719da47fea3339d55ff5a628" + "sha256:711f7eef6d431225bec56c00e386af3f6a47083276253375bdae1ae6c8d23d4a", + "sha256:ae7a7de30f26983d09fff802c1f9d35d54e1c11d7ab52ae904a1f3fc037ecba5" ], "index": "pypi", - "version": "==1.1.1" + "version": "==1.1.3" }, "exceptiongroup": { "hashes": [ - "sha256:2e3c3fc1538a094aab74fad52d6c33fc94de3dfee3ee01f187c0e0c72aec5337", - "sha256:9086a4a21ef9b31c72181c77c040a074ba0889ee56a7b289ff0afb0d97655f96" + "sha256:2ac84b496be68464a2da60da518af3785fff8b7ec0d090a581604bc870bdee41", + "sha256:affbabf13fb6e98988c38d9c5650e701569fe3c1de3233cfb61c5f33774690ad" ], "markers": "python_version < '3.11'", - "version": "==1.0.0rc9" + "version": "==1.0.0" }, "ffmpeg-python": { "hashes": [ @@ -358,19 +358,19 @@ }, "google-api-python-client": { "hashes": [ - "sha256:0dc4c967a5c795e981af01340f1bd22173a986534de968b5456cb208ed6775a6", - "sha256:90545cd71969f8bcf15a6362c2a8c44c38b94ec35a88cfd60cf2c0df68a5eb74" + "sha256:2c6611530308b3f931dcf1360713aa3a20cf465d0bf2bac65f2ec99e8c9860de", + "sha256:b8a0ca8454ad57bc65199044717d3d214197ae1e2d666426bbcd4021b36762e0" ], "index": "pypi", - "version": "==2.64.0" + "version": "==2.65.0" }, "google-auth": { "hashes": [ - "sha256:9352dd6394093169157e6971526bab9a2799244d68a94a4a609f0dd751ef6f5e", - "sha256:99510e664155f1a3c0396a076b5deb6367c52ea04d280152c85ac7f51f50eb42" + "sha256:1ad5b0e6eba5f69645971abb3d2c197537d5914070a8c6d30299dfdb07c5c700", + "sha256:cf24817855d874ede2efd071aa22125445f555de1685b739a9782fcf408c2a3d" ], "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5'", - "version": "==2.13.0" + "version": "==2.14.0" }, "google-auth-httplib2": { "hashes": [ @@ -382,11 +382,11 @@ }, "google-auth-oauthlib": { "hashes": [ - "sha256:307d21918d61a0741882ad1fd001c67e68ad81206451d05fc4d26f79de56fc90", - "sha256:9e8ff4ed2b21c174a2d6cc2172c698dbf0b1f686509774c663a83c495091fe09" + "sha256:53019edbde83e08ff0740eefc5bded7e26a289941d12e7ae1f0f5bacf2faa031", + "sha256:db11bce4b3effc99b518ec22a2903470e0853c0c92be57694e3684e738d22513" ], "index": "pypi", - "version": "==0.5.3" + "version": "==0.7.0" }, "googleapis-common-protos": { "hashes": [ @@ -398,11 +398,11 @@ }, "gspread": { "hashes": [ - "sha256:0fe52bec73cc232abadfbc2a999e30201bc5cb0c2728ec00fcfdf38f6f669375", - "sha256:9fca855173fdb2e648b3da9e7bbffb83601bfd7c7131d44fa781df84c689e7fc" + "sha256:41f7a416425f1ec5a1b677f49b8fbf599102766c27ed7be6601a58c9a1550ebc", + "sha256:d3bbff4b7aad0fc2c986458e148537a02fe7b46e7162f41f3a42392bfa2adb89" ], "index": "pypi", - "version": "==5.6.0" + "version": "==5.6.2" }, "h11": { "hashes": [ @@ -414,11 +414,11 @@ }, "httplib2": { "hashes": [ - "sha256:58a98e45b4b1a48273073f905d2961666ecf0fbac4250ea5b47aef259eb5c585", - "sha256:8b6a905cb1c79eefd03f8669fd993c36dc341f7c558f056cb5a33b5c2f458543" + "sha256:987c8bb3eb82d3fa60c68699510a692aa2ad9c4bd4f123e51dfb1488c14cdd01", + "sha256:fc144f091c7286b82bec71bdbd9b27323ba709cc612568d3000893bfd9cb4b34" ], "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", - "version": "==0.20.4" + "version": "==0.21.0" }, "idna": { "hashes": [ @@ -632,7 +632,7 @@ "sha256:8139f29aac13e25d502680e9e19963e83f16838d48a0d71c287fe40e7067fbca", "sha256:9859c40929662bec5d64f34d01c99e093149682a3f38915dc0655d5a633dd918" ], - "markers": "python_full_version >= '3.6.0'", + "markers": "python_version >= '3.6'", "version": "==3.2.2" }, "outcome": { @@ -648,28 +648,28 @@ "sha256:dd47c42927d89ab911e606518907cc2d3a1f38bbd026385970643f9c5b8ecfeb", "sha256:ef103e05f519cdc783ae24ea4e2e0f508a9c99b2d4969652eed6a2e1ea5bd522" ], - "markers": "python_full_version >= '3.6.0'", + "markers": "python_version >= '3.6'", "version": "==21.3" }, "protobuf": { "hashes": [ - "sha256:3ec85328a35a16463c6f419dbce3c0fc42b3e904d966f17f48bae39597c7a543", - "sha256:58b81358ec6c0b5d50df761460ae2db58405c063fd415e1101209221a0a810e1", - "sha256:71d9dba03ed3432c878a801e2ea51e034b0ea01cf3a4344fb60166cb5f6c8757", - "sha256:8066322588d4b499869bf9f665ebe448e793036b552f68c585a9b28f1e393f66", - "sha256:8e09d1916386eca1ef1353767b6efcebc0a6859ed7f73cb7fb974feba3184830", - "sha256:9643684232b6b340b5e63bb69c9b4904cdd39e4303d498d1a92abddc7e895b7f", - "sha256:9e355f2a839d9930d83971b9f562395e13493f0e9211520f8913bd11efa53c02", - "sha256:a74d96cd960b87b4b712797c741bb3ea3a913f5c2dc4b6cbe9c0f8360b75297d", - "sha256:b019c79e23a80735cc8a71b95f76a49a262f579d6b84fd20a0b82279f40e2cc1", - "sha256:c7cb105d69a87416bd9023e64324e1c089593e6dae64d2536f06bcbe49cd97d8", - "sha256:ca200645d6235ce0df3ccfdff1567acbab35c4db222a97357806e015f85b5744", - "sha256:d3f89ccf7182293feba2de2739c8bf34fed1ed7c65a5cf987be00311acac57c1", - "sha256:db9056b6a11cb5131036d734bcbf91ef3ef9235d6b681b2fc431cbfe5a7f2e56", - "sha256:f370c0a71712f8965023dd5b13277444d3cdfecc96b2c778b0e19acbfd60df6e" + "sha256:2c9c2ed7466ad565f18668aa4731c535511c5d9a40c6da39524bccf43e441719", + "sha256:48e2cd6b88c6ed3d5877a3ea40df79d08374088e89bedc32557348848dff250b", + "sha256:5b0834e61fb38f34ba8840d7dcb2e5a2f03de0c714e0293b3963b79db26de8ce", + "sha256:61f21493d96d2a77f9ca84fefa105872550ab5ef71d21c458eb80edcf4885a99", + "sha256:6e0be9f09bf9b6cf497b27425487706fa48c6d1632ddd94dab1a5fe11a422392", + "sha256:6e312e280fbe3c74ea9e080d9e6080b636798b5e3939242298b591064470b06b", + "sha256:7eb8f2cc41a34e9c956c256e3ac766cf4e1a4c9c925dc757a41a01be3e852965", + "sha256:84ea107016244dfc1eecae7684f7ce13c788b9a644cd3fca5b77871366556444", + "sha256:9227c14010acd9ae7702d6467b4625b6fe853175a6b150e539b21d2b2f2b409c", + "sha256:a419cc95fca8694804709b8c4f2326266d29659b126a93befe210f5bbc772536", + "sha256:a7d0ea43949d45b836234f4ebb5ba0b22e7432d065394b532cdca8f98415e3cf", + "sha256:b5ab0b8918c136345ff045d4b3d5f719b505b7c8af45092d7f45e304f55e50a1", + "sha256:e575c57dc8b5b2b2caa436c16d44ef6981f2235eb7179bfc847557886376d740", + "sha256:f9eae277dd240ae19bb06ff4e2346e771252b0e619421965504bd1b1bba7c5fa" ], "markers": "python_version >= '3.7'", - "version": "==4.21.7" + "version": "==4.21.9" }, "pyaes": { "hashes": [ @@ -761,7 +761,7 @@ "sha256:56a8508ae95f98e2b9bdf93a6be5ae3f7d8af858b43e02c5a2ff083726be40c1", "sha256:f643f331ab57ba3c9d89212ee4a2dabc6e94f117cf4eefde99a0574720d14c42" ], - "markers": "python_full_version >= '3.6.0'", + "markers": "python_version >= '3.6'", "version": "==2.13.0" }, "pyparsing": { @@ -814,9 +814,10 @@ }, "pytz": { "hashes": [ - "sha256:335ab46900b1465e714b4fda4963d87363264eb662aab5e65da039c25f1f5b22" + "sha256:222439474e9c98fced559f1709d89e6c9cbf8d79c794ff3eb9f8800064291427", + "sha256:e89512406b793ca39f5971bc999cc538ce125c0e51c27941bef4568b460095e2" ], - "version": "==2022.5" + "version": "==2022.6" }, "pytz-deprecation-shim": { "hashes": [ @@ -949,7 +950,7 @@ "sha256:fbc88d3ba402b5d041d204ec2449c4078898f89c4a6e6f0ed1c1a510ef1e221d", "sha256:fbd3fe37353c62fd0eb19fb76f78aa693716262bcd5f9c14bb9e5aca4b3f0dc4" ], - "markers": "python_full_version >= '3.6.0'", + "markers": "python_version >= '3.6'", "version": "==2022.3.2" }, "requests": { @@ -970,11 +971,11 @@ }, "requests-toolbelt": { "hashes": [ - "sha256:64c6b8c51b515d123f9f708a29743f44eb70c4479440641ed2df8c4dea56d985", - "sha256:f695d6207931200b46c8ef6addbc8a921fb5d77cc4cd209c2e7d39293fcd2b30" + "sha256:18565aa58116d9951ac39baa288d3adb5b3ff975c4f25eee78555d89e8f247f7", + "sha256:62e09f7ff5ccbda92772a29f394a49c3ad6cb181d568b1337626b2abb628a63d" ], "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", - "version": "==0.10.0" + "version": "==0.10.1" }, "rich": { "hashes": [ @@ -989,7 +990,7 @@ "sha256:90260d9058e514786967344d0ef75fa8727eed8a7d2e43ce9f4bcf1b536174f7", "sha256:e38464a49c6c85d7f1351b0126661487a7e0a14a50f1675ec50eb34d4f20ef21" ], - "markers": "python_version < '4' and python_full_version >= '3.6.0'", + "markers": "python_version >= '3.6' and python_version < '4'", "version": "==4.9" }, "s3transfer": { @@ -1043,7 +1044,7 @@ "sha256:3b2503d3c7084a42b1ebd08116e5f81aadfaea95863628c80a3b774a11b7c759", "sha256:fc53893b3da2c33de295667a0e19f078c14bf86544af307354de5fcf12a3f30d" ], - "markers": "python_full_version >= '3.6.0'", + "markers": "python_version >= '3.6'", "version": "==2.3.2.post1" }, "telethon": { @@ -1109,18 +1110,18 @@ }, "tzdata": { "hashes": [ - "sha256:323161b22b7802fdc78f20ca5f6073639c64f1a7227c40cd3e19fd1d0ce6650a", - "sha256:e15b2b3005e2546108af42a0eb4ccab4d9e225e2dfbf4f77aad50c70a4b1f3ab" + "sha256:04a680bdc5b15750c39c12a448885a51134a27ec9af83667663f0b3a1bf3f342", + "sha256:91f11db4503385928c15598c98573e3af07e7229181bee5375bd30f1695ddcae" ], - "markers": "python_full_version >= '3.6.0'", - "version": "==2022.5" + "markers": "python_version >= '3.6'", + "version": "==2022.6" }, "tzlocal": { "hashes": [ "sha256:89885494684c929d9191c57aa27502afc87a579be5cdd3225c77c463ea043745", "sha256:ee5842fa3a795f023514ac2d801c4a81d1743bbe642e3940143326b3a00addd7" ], - "markers": "python_full_version >= '3.6.0'", + "markers": "python_version >= '3.6'", "version": "==4.2" }, "uritemplate": { @@ -1128,7 +1129,7 @@ "sha256:4346edfc5c3b79f694bccd6d6099a322bbeb628dbf2cd86eea55a456ce5124f0", "sha256:830c08b8d99bdd312ea4ead05994a38e8936266f84b9a7878232db50b044e02e" ], - "markers": "python_full_version >= '3.6.0'", + "markers": "python_version >= '3.6'", "version": "==4.1.1" }, "urllib3": { @@ -1148,65 +1149,86 @@ }, "vk-url-scraper": { "hashes": [ - "sha256:7caf8d788fc268d311b13c06ff0cbd9413dd8978f463af970459b9e7e2f42ba5", - "sha256:c4593d86b5096e75e2845e4838f46ce2cf0ac34b2fe1c4476d2eeb6744b18a11" + "sha256:3718a569e431c9c2bc7e92e9156e25b7112dc0b9b461c8001fa481a00ccbd3bc", + "sha256:baebe32bb29d6f188d849f38ecc43d04d5b5bad05db7f31dfdbe450f684042f0" ], "index": "pypi", - "version": "==0.3.5" + "version": "==0.3.8" }, "websockets": { "hashes": [ - "sha256:07cdc0a5b2549bcfbadb585ad8471ebdc7bdf91e32e34ae3889001c1c106a6af", - "sha256:210aad7fdd381c52e58777560860c7e6110b6174488ef1d4b681c08b68bf7f8c", - "sha256:28dd20b938a57c3124028680dc1600c197294da5db4292c76a0b48efb3ed7f76", - "sha256:2f94fa3ae454a63ea3a19f73b95deeebc9f02ba2d5617ca16f0bbdae375cda47", - "sha256:31564a67c3e4005f27815634343df688b25705cccb22bc1db621c781ddc64c69", - "sha256:347974105bbd4ea068106ec65e8e8ebd86f28c19e529d115d89bd8cc5cda3079", - "sha256:379e03422178436af4f3abe0aa8f401aa77ae2487843738542a75faf44a31f0c", - "sha256:3eda1cb7e9da1b22588cefff09f0951771d6ee9fa8dbe66f5ae04cc5f26b2b55", - "sha256:51695d3b199cd03098ae5b42833006a0f43dc5418d3102972addc593a783bc02", - "sha256:54c000abeaff6d8771a4e2cef40900919908ea7b6b6a30eae72752607c6db559", - "sha256:5b936bf552e4f6357f5727579072ff1e1324717902127ffe60c92d29b67b7be3", - "sha256:6075fd24df23133c1b078e08a9b04a3bc40b31a8def4ee0b9f2c8865acce913e", - "sha256:661f641b44ed315556a2fa630239adfd77bd1b11cb0b9d96ed8ad90b0b1e4978", - "sha256:6ea6b300a6bdd782e49922d690e11c3669828fe36fc2471408c58b93b5535a98", - "sha256:6ed1d6f791eabfd9808afea1e068f5e59418e55721db8b7f3bfc39dc831c42ae", - "sha256:7934e055fd5cd9dee60f11d16c8d79c4567315824bacb1246d0208a47eca9755", - "sha256:7ab36e17af592eec5747c68ef2722a74c1a4a70f3772bc661079baf4ae30e40d", - "sha256:7f6d96fdb0975044fdd7953b35d003b03f9e2bcf85f2d2cf86285ece53e9f991", - "sha256:83e5ca0d5b743cde3d29fda74ccab37bdd0911f25bd4cdf09ff8b51b7b4f2fa1", - "sha256:85506b3328a9e083cc0a0fb3ba27e33c8db78341b3eb12eb72e8afd166c36680", - "sha256:8af75085b4bc0b5c40c4a3c0e113fa95e84c60f4ed6786cbb675aeb1ee128247", - "sha256:8b1359aba0ff810d5830d5ab8e2c4a02bebf98a60aa0124fb29aa78cfdb8031f", - "sha256:8fbd7d77f8aba46d43245e86dd91a8970eac4fb74c473f8e30e9c07581f852b2", - "sha256:907e8247480f287aa9bbc9391bd6de23c906d48af54c8c421df84655eef66af7", - "sha256:93d5ea0b5da8d66d868b32c614d2b52d14304444e39e13a59566d4acb8d6e2e4", - "sha256:97bc9d41e69a7521a358f9b8e44871f6cdeb42af31815c17aed36372d4eec667", - "sha256:994cdb1942a7a4c2e10098d9162948c9e7b235df755de91ca33f6e0481366fdb", - "sha256:a141de3d5a92188234afa61653ed0bbd2dde46ad47b15c3042ffb89548e77094", - "sha256:a1e15b230c3613e8ea82c9fc6941b2093e8eb939dd794c02754d33980ba81e36", - "sha256:aad5e300ab32036eb3fdc350ad30877210e2f51bceaca83fb7fef4d2b6c72b79", - "sha256:b529fdfa881b69fe563dbd98acce84f3e5a67df13de415e143ef053ff006d500", - "sha256:b9c77f0d1436ea4b4dc089ed8335fa141e6a251a92f75f675056dac4ab47a71e", - "sha256:bb621ec2dbbbe8df78a27dbd9dd7919f9b7d32a73fafcb4d9252fc4637343582", - "sha256:c7250848ce69559756ad0086a37b82c986cd33c2d344ab87fea596c5ac6d9442", - "sha256:c8d1d14aa0f600b5be363077b621b1b4d1eb3fbf90af83f9281cda668e6ff7fd", - "sha256:d1655a6fc7aecd333b079d00fb3c8132d18988e47f19740c69303bf02e9883c6", - "sha256:d6353ba89cfc657a3f5beabb3b69be226adbb5c6c7a66398e17809b0ce3c4731", - "sha256:da4377904a3379f0c1b75a965fff23b28315bcd516d27f99a803720dfebd94d4", - "sha256:e49ea4c1a9543d2bd8a747ff24411509c29e4bdcde05b5b0895e2120cb1a761d", - "sha256:e4e08305bfd76ba8edab08dcc6496f40674f44eb9d5e23153efa0a35750337e8", - "sha256:e6fa05a680e35d0fcc1470cb070b10e6fe247af54768f488ed93542e71339d6f", - "sha256:e7e6f2d6fd48422071cc8a6f8542016f350b79cc782752de531577d35e9bd677", - "sha256:e904c0381c014b914136c492c8fa711ca4cced4e9b3d110e5e7d436d0fc289e8", - "sha256:ec2b0ab7edc8cd4b0eb428b38ed89079bdc20c6bdb5f889d353011038caac2f9", - "sha256:ef5ce841e102278c1c2e98f043db99d6755b1c58bde475516aef3a008ed7f28e", - "sha256:f351c7d7d92f67c0609329ab2735eee0426a03022771b00102816a72715bb00b", - "sha256:fab7c640815812ed5f10fbee7abbf58788d602046b7bb3af9b1ac753a6d5e916", - "sha256:fc06cc8073c8e87072138ba1e431300e2d408f054b27047d047b549455066ff4" + "sha256:00213676a2e46b6ebf6045bc11d0f529d9120baa6f58d122b4021ad92adabd41", + "sha256:00c870522cdb69cd625b93f002961ffb0c095394f06ba8c48f17eef7c1541f96", + "sha256:0154f7691e4fe6c2b2bc275b5701e8b158dae92a1ab229e2b940efe11905dff4", + "sha256:05a7233089f8bd355e8cbe127c2e8ca0b4ea55467861906b80d2ebc7db4d6b72", + "sha256:09a1814bb15eff7069e51fed0826df0bc0702652b5cb8f87697d469d79c23576", + "sha256:0cff816f51fb33c26d6e2b16b5c7d48eaa31dae5488ace6aae468b361f422b63", + "sha256:185929b4808b36a79c65b7865783b87b6841e852ef5407a2fb0c03381092fa3b", + "sha256:2fc8709c00704194213d45e455adc106ff9e87658297f72d544220e32029cd3d", + "sha256:33d69ca7612f0ddff3316b0c7b33ca180d464ecac2d115805c044bf0a3b0d032", + "sha256:389f8dbb5c489e305fb113ca1b6bdcdaa130923f77485db5b189de343a179393", + "sha256:38ea7b82bfcae927eeffc55d2ffa31665dc7fec7b8dc654506b8e5a518eb4d50", + "sha256:3d3cac3e32b2c8414f4f87c1b2ab686fa6284a980ba283617404377cd448f631", + "sha256:40e826de3085721dabc7cf9bfd41682dadc02286d8cf149b3ad05bff89311e4f", + "sha256:4239b6027e3d66a89446908ff3027d2737afc1a375f8fd3eea630a4842ec9a0c", + "sha256:45ec8e75b7dbc9539cbfafa570742fe4f676eb8b0d3694b67dabe2f2ceed8aa6", + "sha256:47a2964021f2110116cc1125b3e6d87ab5ad16dea161949e7244ec583b905bb4", + "sha256:48c08473563323f9c9debac781ecf66f94ad5a3680a38fe84dee5388cf5acaf6", + "sha256:4c6d2264f485f0b53adf22697ac11e261ce84805c232ed5dbe6b1bcb84b00ff0", + "sha256:4f72e5cd0f18f262f5da20efa9e241699e0cf3a766317a17392550c9ad7b37d8", + "sha256:56029457f219ade1f2fc12a6504ea61e14ee227a815531f9738e41203a429112", + "sha256:5c1289596042fad2cdceb05e1ebf7aadf9995c928e0da2b7a4e99494953b1b94", + "sha256:62e627f6b6d4aed919a2052efc408da7a545c606268d5ab5bfab4432734b82b4", + "sha256:74de2b894b47f1d21cbd0b37a5e2b2392ad95d17ae983e64727e18eb281fe7cb", + "sha256:7c584f366f46ba667cfa66020344886cf47088e79c9b9d39c84ce9ea98aaa331", + "sha256:7d27a7e34c313b3a7f91adcd05134315002aaf8540d7b4f90336beafaea6217c", + "sha256:7d3f0b61c45c3fa9a349cf484962c559a8a1d80dae6977276df8fd1fa5e3cb8c", + "sha256:82ff5e1cae4e855147fd57a2863376ed7454134c2bf49ec604dfe71e446e2193", + "sha256:84bc2a7d075f32f6ed98652db3a680a17a4edb21ca7f80fe42e38753a58ee02b", + "sha256:884be66c76a444c59f801ac13f40c76f176f1bfa815ef5b8ed44321e74f1600b", + "sha256:8a5cc00546e0a701da4639aa0bbcb0ae2bb678c87f46da01ac2d789e1f2d2038", + "sha256:8dc96f64ae43dde92530775e9cb169979f414dcf5cff670455d81a6823b42089", + "sha256:8f38706e0b15d3c20ef6259fd4bc1700cd133b06c3c1bb108ffe3f8947be15fa", + "sha256:90fcf8929836d4a0e964d799a58823547df5a5e9afa83081761630553be731f9", + "sha256:931c039af54fc195fe6ad536fde4b0de04da9d5916e78e55405436348cfb0e56", + "sha256:932af322458da7e4e35df32f050389e13d3d96b09d274b22a7aa1808f292fee4", + "sha256:942de28af58f352a6f588bc72490ae0f4ccd6dfc2bd3de5945b882a078e4e179", + "sha256:9bc42e8402dc5e9905fb8b9649f57efcb2056693b7e88faa8fb029256ba9c68c", + "sha256:a7a240d7a74bf8d5cb3bfe6be7f21697a28ec4b1a437607bae08ac7acf5b4882", + "sha256:a9f9a735deaf9a0cadc2d8c50d1a5bcdbae8b6e539c6e08237bc4082d7c13f28", + "sha256:ae5e95cfb53ab1da62185e23b3130e11d64431179debac6dc3c6acf08760e9b1", + "sha256:b029fb2032ae4724d8ae8d4f6b363f2cc39e4c7b12454df8df7f0f563ed3e61a", + "sha256:b0d15c968ea7a65211e084f523151dbf8ae44634de03c801b8bd070b74e85033", + "sha256:b343f521b047493dc4022dd338fc6db9d9282658862756b4f6fd0e996c1380e1", + "sha256:b627c266f295de9dea86bd1112ed3d5fafb69a348af30a2422e16590a8ecba13", + "sha256:b9968694c5f467bf67ef97ae7ad4d56d14be2751000c1207d31bf3bb8860bae8", + "sha256:ba089c499e1f4155d2a3c2a05d2878a3428cf321c848f2b5a45ce55f0d7d310c", + "sha256:bbccd847aa0c3a69b5f691a84d2341a4f8a629c6922558f2a70611305f902d74", + "sha256:bc0b82d728fe21a0d03e65f81980abbbcb13b5387f733a1a870672c5be26edab", + "sha256:c57e4c1349fbe0e446c9fa7b19ed2f8a4417233b6984277cce392819123142d3", + "sha256:c94ae4faf2d09f7c81847c63843f84fe47bf6253c9d60b20f25edfd30fb12588", + "sha256:c9b27d6c1c6cd53dc93614967e9ce00ae7f864a2d9f99fe5ed86706e1ecbf485", + "sha256:d210abe51b5da0ffdbf7b43eed0cfdff8a55a1ab17abbec4301c9ff077dd0342", + "sha256:d58804e996d7d2307173d56c297cf7bc132c52df27a3efaac5e8d43e36c21c48", + "sha256:d6a4162139374a49eb18ef5b2f4da1dd95c994588f5033d64e0bbfda4b6b6fcf", + "sha256:da39dd03d130162deb63da51f6e66ed73032ae62e74aaccc4236e30edccddbb0", + "sha256:db3c336f9eda2532ec0fd8ea49fef7a8df8f6c804cdf4f39e5c5c0d4a4ad9a7a", + "sha256:dd500e0a5e11969cdd3320935ca2ff1e936f2358f9c2e61f100a1660933320ea", + "sha256:dd9becd5fe29773d140d68d607d66a38f60e31b86df75332703757ee645b6faf", + "sha256:e0cb5cc6ece6ffa75baccfd5c02cffe776f3f5c8bf486811f9d3ea3453676ce8", + "sha256:e23173580d740bf8822fd0379e4bf30aa1d5a92a4f252d34e893070c081050df", + "sha256:e3a686ecb4aa0d64ae60c9c9f1a7d5d46cab9bfb5d91a2d303d00e2cd4c4c5cc", + "sha256:e789376b52c295c4946403bd0efecf27ab98f05319df4583d3c48e43c7342c2f", + "sha256:edc344de4dac1d89300a053ac973299e82d3db56330f3494905643bb68801269", + "sha256:eef610b23933c54d5d921c92578ae5f89813438fded840c2e9809d378dc765d3", + "sha256:f2c38d588887a609191d30e902df2a32711f708abfd85d318ca9b367258cfd0c", + "sha256:f55b5905705725af31ccef50e55391621532cd64fbf0bc6f4bac935f0fccec46", + "sha256:f5fc088b7a32f244c519a048c170f14cf2251b849ef0e20cbbb0fdf0fdaf556f", + "sha256:fe10ddc59b304cb19a1bdf5bd0a7719cbbc9fbdd57ac80ed436b709fcf889106", + "sha256:ff64a1d38d156d429404aaa84b27305e957fd10c30e5880d1765c9480bea490f" ], "markers": "python_version >= '3.7'", - "version": "==10.3" + "version": "==10.4" }, "werkzeug": { "hashes": [ @@ -1226,21 +1248,21 @@ }, "yt-dlp": { "hashes": [ - "sha256:3a7b59d2fb4b39ce8ba8e0b9c5a37fe20e5624f46a2346b4ae66ab1320e35134", - "sha256:deec1009442312c1e2ee5298966842194d0e950b433f0d4fc844ef464b9c32a7" + "sha256:0e7b81fc6ac8d1b7d3fffa79f9044ca4163784422582c9a3593305da2a69ec02", + "sha256:d7d1f81d230756f094b4d9ee59b37b2c13b2e63ff5fb72cda53625edb072cdae" ], "index": "pypi", - "version": "==2022.5.18" + "version": "==2022.7.18" } }, "develop": { "autopep8": { "hashes": [ - "sha256:6f09e90a2be784317e84dc1add17ebfc7abe3924239957a37e5040e27d812087", - "sha256:ca9b1a83e53a7fad65d731dc7a2a2d50aa48f43850407c59f6a1a306c4201142" + "sha256:8b1659c7f003e693199f52caffdc06585bb0716900bbc6a7442fd931d658c077", + "sha256:ad924b42c2e27a1ac58e432166cc4588f5b80747de02d0d35b1ecbd3e7d57207" ], "index": "pypi", - "version": "==1.7.0" + "version": "==2.0.0" }, "pycodestyle": { "hashes": [ @@ -1250,13 +1272,13 @@ "markers": "python_version >= '3.6'", "version": "==2.9.1" }, - "toml": { + "tomli": { "hashes": [ - "sha256:806143ae5bfb6a3c6e736a764057db0e6a0e05e338b5630894a5f779cabb4f9b", - "sha256:b3bda1d108d5dd99f4a20d24d9c348e91c4db7ab1b749200bded2f839ccbe68f" + "sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc", + "sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f" ], - "markers": "python_version >= '2.6' and python_version not in '3.0, 3.1, 3.2, 3.3'", - "version": "==0.10.2" + "markers": "python_version >= '3.7'", + "version": "==2.0.1" } } } diff --git a/archivers/base_archiver.py b/archivers/base_archiver.py index 3dc5ba1..7037e4d 100644 --- a/archivers/base_archiver.py +++ b/archivers/base_archiver.py @@ -244,8 +244,13 @@ class Archiver(ABC): filename = os.path.join(browsertrix_home, "collections", collection, f"{collection}.wacz") - self.storage.upload(filename, key, extra_args={ + # do not crash if upload fails + try: + self.storage.upload(filename, key, extra_args={ 'ACL': 'public-read', 'ContentType': 'application/zip'}) + except FileNotFoundError as e: + logger.warning(f"Unable to locate and upload WACZ {filename=}, {key=}") + # clean up the local browsertrix files try: From 81eadd46720e384c0f598fba0f580eaee346a4f1 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Tue, 8 Nov 2022 14:22:13 +0000 Subject: [PATCH 017/190] disable browsertrix on docker, see #66 --- archivers/base_archiver.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/archivers/base_archiver.py b/archivers/base_archiver.py index 7037e4d..5ef2b7e 100644 --- a/archivers/base_archiver.py +++ b/archivers/base_archiver.py @@ -37,6 +37,7 @@ class Archiver(ABC): self.driver = config.webdriver self.hash_algorithm = config.hash_algorithm self.browsertrix = config.browsertrix_config + self.is_docker = config.is_docker def __str__(self): return self.__class__.__name__ @@ -206,6 +207,11 @@ class Archiver(ABC): if not self.browsertrix.enabled: logger.debug(f"Browsertrix WACZ generation is not enabled, skipping.") return + if self.is_docker: + # TODO: figure out support for browsertrix in docker + # see: https://github.com/bellingcat/auto-archiver/issues/66 + logger.warning(f"Browsertrix WACZ is not yet supported when using DOCKER.") + return logger.debug(f"getting wacz for {url}") key = self._get_key_from_url(url, ".wacz", append_datetime=True) From 390b84eb22393a40b44aaa69ae3b1f8432b752d8 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Tue, 8 Nov 2022 15:55:33 +0000 Subject: [PATCH 018/190] dockerization complete --- Dockerfile | 7 +++---- src/__init__.py | 0 {archivers => src/archivers}/__init__.py | 0 {archivers => src/archivers}/base_archiver.py | 0 {archivers => src/archivers}/instagram_archiver.py | 0 {archivers => src/archivers}/telegram_archiver.py | 0 {archivers => src/archivers}/telethon_archiver.py | 0 {archivers => src/archivers}/tiktok_archiver.py | 0 {archivers => src/archivers}/twitter_api_archiver.py | 0 {archivers => src/archivers}/twitter_archiver.py | 0 {archivers => src/archivers}/vk_archiver.py | 0 {archivers => src/archivers}/wayback_archiver.py | 0 {archivers => src/archivers}/youtubedl_archiver.py | 0 auto_archive.py => src/auto_archive.py | 0 auto_auto_archive.py => src/auto_auto_archive.py | 0 src/cli.py | 0 {configs => src/configs}/__init__.py | 0 {configs => src/configs}/browsertrix_config.py | 0 {configs => src/configs}/config.py | 0 {configs => src/configs}/instagram_config.py | 0 {configs => src/configs}/selenium_config.py | 0 {configs => src/configs}/telethon_config.py | 0 {configs => src/configs}/twitter_api_config.py | 0 {configs => src/configs}/vk_config.py | 0 {configs => src/configs}/wayback_config.py | 0 {storages => src/storages}/__init__.py | 0 {storages => src/storages}/base_storage.py | 0 {storages => src/storages}/gd_storage.py | 0 {storages => src/storages}/local_storage.py | 0 {storages => src/storages}/s3_storage.py | 0 {utils => src/utils}/__init__.py | 0 {utils => src/utils}/gworksheet.py | 0 {utils => src/utils}/misc.py | 0 33 files changed, 3 insertions(+), 4 deletions(-) create mode 100644 src/__init__.py rename {archivers => src/archivers}/__init__.py (100%) rename {archivers => src/archivers}/base_archiver.py (100%) rename {archivers => src/archivers}/instagram_archiver.py (100%) rename {archivers => src/archivers}/telegram_archiver.py (100%) rename {archivers => src/archivers}/telethon_archiver.py (100%) rename {archivers => src/archivers}/tiktok_archiver.py (100%) rename {archivers => src/archivers}/twitter_api_archiver.py (100%) rename {archivers => src/archivers}/twitter_archiver.py (100%) rename {archivers => src/archivers}/vk_archiver.py (100%) rename {archivers => src/archivers}/wayback_archiver.py (100%) rename {archivers => src/archivers}/youtubedl_archiver.py (100%) rename auto_archive.py => src/auto_archive.py (100%) rename auto_auto_archive.py => src/auto_auto_archive.py (100%) create mode 100644 src/cli.py rename {configs => src/configs}/__init__.py (100%) rename {configs => src/configs}/browsertrix_config.py (100%) rename {configs => src/configs}/config.py (100%) rename {configs => src/configs}/instagram_config.py (100%) rename {configs => src/configs}/selenium_config.py (100%) rename {configs => src/configs}/telethon_config.py (100%) rename {configs => src/configs}/twitter_api_config.py (100%) rename {configs => src/configs}/vk_config.py (100%) rename {configs => src/configs}/wayback_config.py (100%) rename {storages => src/storages}/__init__.py (100%) rename {storages => src/storages}/base_storage.py (100%) rename {storages => src/storages}/gd_storage.py (100%) rename {storages => src/storages}/local_storage.py (100%) rename {storages => src/storages}/s3_storage.py (100%) rename {utils => src/utils}/__init__.py (100%) rename {utils => src/utils}/gworksheet.py (100%) rename {utils => src/utils}/misc.py (100%) diff --git a/Dockerfile b/Dockerfile index a9b4d7a..5db284a 100644 --- a/Dockerfile +++ b/Dockerfile @@ -15,16 +15,15 @@ RUN pip install --upgrade pip && \ # install docker for WACZ -RUN curl -fsSL https://get.docker.com | sh +# TODO: currently disabled see https://github.com/bellingcat/auto-archiver/issues/66 +# RUN curl -fsSL https://get.docker.com | sh # RUN git clone https://github.com/bellingcat/auto-archiver # TODO: avoid copying unnecessary files, including .git -# COPY ./src/ . COPY Pipfile Pipfile.lock ./ RUN pipenv install --python=3.10 --system --deploy -# TODO: to avoid copying pipfile lock it should be on the .dockerignore ENV IS_DOCKER=1 -COPY . . +COPY ./src/ . # CMD ["pipenv", "run", "python", "auto_archive.py"] ENTRYPOINT ["python", "auto_archive.py"] diff --git a/src/__init__.py b/src/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/archivers/__init__.py b/src/archivers/__init__.py similarity index 100% rename from archivers/__init__.py rename to src/archivers/__init__.py diff --git a/archivers/base_archiver.py b/src/archivers/base_archiver.py similarity index 100% rename from archivers/base_archiver.py rename to src/archivers/base_archiver.py diff --git a/archivers/instagram_archiver.py b/src/archivers/instagram_archiver.py similarity index 100% rename from archivers/instagram_archiver.py rename to src/archivers/instagram_archiver.py diff --git a/archivers/telegram_archiver.py b/src/archivers/telegram_archiver.py similarity index 100% rename from archivers/telegram_archiver.py rename to src/archivers/telegram_archiver.py diff --git a/archivers/telethon_archiver.py b/src/archivers/telethon_archiver.py similarity index 100% rename from archivers/telethon_archiver.py rename to src/archivers/telethon_archiver.py diff --git a/archivers/tiktok_archiver.py b/src/archivers/tiktok_archiver.py similarity index 100% rename from archivers/tiktok_archiver.py rename to src/archivers/tiktok_archiver.py diff --git a/archivers/twitter_api_archiver.py b/src/archivers/twitter_api_archiver.py similarity index 100% rename from archivers/twitter_api_archiver.py rename to src/archivers/twitter_api_archiver.py diff --git a/archivers/twitter_archiver.py b/src/archivers/twitter_archiver.py similarity index 100% rename from archivers/twitter_archiver.py rename to src/archivers/twitter_archiver.py diff --git a/archivers/vk_archiver.py b/src/archivers/vk_archiver.py similarity index 100% rename from archivers/vk_archiver.py rename to src/archivers/vk_archiver.py diff --git a/archivers/wayback_archiver.py b/src/archivers/wayback_archiver.py similarity index 100% rename from archivers/wayback_archiver.py rename to src/archivers/wayback_archiver.py diff --git a/archivers/youtubedl_archiver.py b/src/archivers/youtubedl_archiver.py similarity index 100% rename from archivers/youtubedl_archiver.py rename to src/archivers/youtubedl_archiver.py diff --git a/auto_archive.py b/src/auto_archive.py similarity index 100% rename from auto_archive.py rename to src/auto_archive.py diff --git a/auto_auto_archive.py b/src/auto_auto_archive.py similarity index 100% rename from auto_auto_archive.py rename to src/auto_auto_archive.py diff --git a/src/cli.py b/src/cli.py new file mode 100644 index 0000000..e69de29 diff --git a/configs/__init__.py b/src/configs/__init__.py similarity index 100% rename from configs/__init__.py rename to src/configs/__init__.py diff --git a/configs/browsertrix_config.py b/src/configs/browsertrix_config.py similarity index 100% rename from configs/browsertrix_config.py rename to src/configs/browsertrix_config.py diff --git a/configs/config.py b/src/configs/config.py similarity index 100% rename from configs/config.py rename to src/configs/config.py diff --git a/configs/instagram_config.py b/src/configs/instagram_config.py similarity index 100% rename from configs/instagram_config.py rename to src/configs/instagram_config.py diff --git a/configs/selenium_config.py b/src/configs/selenium_config.py similarity index 100% rename from configs/selenium_config.py rename to src/configs/selenium_config.py diff --git a/configs/telethon_config.py b/src/configs/telethon_config.py similarity index 100% rename from configs/telethon_config.py rename to src/configs/telethon_config.py diff --git a/configs/twitter_api_config.py b/src/configs/twitter_api_config.py similarity index 100% rename from configs/twitter_api_config.py rename to src/configs/twitter_api_config.py diff --git a/configs/vk_config.py b/src/configs/vk_config.py similarity index 100% rename from configs/vk_config.py rename to src/configs/vk_config.py diff --git a/configs/wayback_config.py b/src/configs/wayback_config.py similarity index 100% rename from configs/wayback_config.py rename to src/configs/wayback_config.py diff --git a/storages/__init__.py b/src/storages/__init__.py similarity index 100% rename from storages/__init__.py rename to src/storages/__init__.py diff --git a/storages/base_storage.py b/src/storages/base_storage.py similarity index 100% rename from storages/base_storage.py rename to src/storages/base_storage.py diff --git a/storages/gd_storage.py b/src/storages/gd_storage.py similarity index 100% rename from storages/gd_storage.py rename to src/storages/gd_storage.py diff --git a/storages/local_storage.py b/src/storages/local_storage.py similarity index 100% rename from storages/local_storage.py rename to src/storages/local_storage.py diff --git a/storages/s3_storage.py b/src/storages/s3_storage.py similarity index 100% rename from storages/s3_storage.py rename to src/storages/s3_storage.py diff --git a/utils/__init__.py b/src/utils/__init__.py similarity index 100% rename from utils/__init__.py rename to src/utils/__init__.py diff --git a/utils/gworksheet.py b/src/utils/gworksheet.py similarity index 100% rename from utils/gworksheet.py rename to src/utils/gworksheet.py diff --git a/utils/misc.py b/src/utils/misc.py similarity index 100% rename from utils/misc.py rename to src/utils/misc.py From 04263094ad01122620895df1d00d02e57bcdc5f4 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Thu, 10 Nov 2022 17:46:40 +0000 Subject: [PATCH 019/190] WIP docker changes for cli and auto_archiver --- Dockerfile | 9 +-- src/archivers/base_archiver.py | 40 +++++++++--- src/archivers/instagram_archiver.py | 4 +- src/archivers/telegram_archiver.py | 4 +- src/archivers/telethon_archiver.py | 6 +- src/archivers/tiktok_archiver.py | 10 +-- src/archivers/twitter_api_archiver.py | 4 +- src/archivers/twitter_archiver.py | 6 +- src/archivers/vk_archiver.py | 4 +- src/archivers/wayback_archiver.py | 8 +-- src/archivers/youtubedl_archiver.py | 4 +- src/auto_archive.py | 87 ++++++++++++++------------- src/cli.py | 30 +++++++++ src/configs/config.py | 3 + src/storages/base_storage.py | 9 +++ src/storages/gd_storage.py | 7 --- src/storages/local_storage.py | 7 ++- 17 files changed, 156 insertions(+), 86 deletions(-) diff --git a/Dockerfile b/Dockerfile index 5db284a..96b8405 100644 --- a/Dockerfile +++ b/Dockerfile @@ -18,16 +18,17 @@ RUN pip install --upgrade pip && \ # TODO: currently disabled see https://github.com/bellingcat/auto-archiver/issues/66 # RUN curl -fsSL https://get.docker.com | sh -# RUN git clone https://github.com/bellingcat/auto-archiver # TODO: avoid copying unnecessary files, including .git COPY Pipfile Pipfile.lock ./ RUN pipenv install --python=3.10 --system --deploy ENV IS_DOCKER=1 COPY ./src/ . -# CMD ["pipenv", "run", "python", "auto_archive.py"] -ENTRYPOINT ["python", "auto_archive.py"] +# TODO: figure out how to make volumes not be root, does it depend on host or dockerfile? +# RUN useradd --system --groups sudo --shell /bin/bash archiver && chown -R archiver:sudo . +# USER archiver +ENTRYPOINT ["python"] # ENTRYPOINT ["docker-entrypoint.sh"] -# should be executed with 2 volumes +# should be executed with 2 volumes (3 if local_storage) # docker run -v /var/run/docker.sock:/var/run/docker.sock -v $PWD/secrets:/app/secrets -v $PWD/local_archive:/app/local_archive aa --help \ No newline at end of file diff --git a/src/archivers/base_archiver.py b/src/archivers/base_archiver.py index 5ef2b7e..75395b5 100644 --- a/src/archivers/base_archiver.py +++ b/src/archivers/base_archiver.py @@ -1,8 +1,9 @@ import os, datetime, shutil, hashlib, time, requests, re, mimetypes, subprocess -from dataclasses import dataclass +from dataclasses import dataclass, field from abc import ABC, abstractmethod from urllib.parse import urlparse from random import randrange +from collections import defaultdict import ffmpeg from loguru import logger @@ -27,6 +28,7 @@ class ArchiveResult: screenshot: str = None wacz: str = None hash: str = None + media: list = field(default_factory=list) class Archiver(ABC): name = "default" @@ -38,6 +40,7 @@ class Archiver(ABC): self.hash_algorithm = config.hash_algorithm self.browsertrix = config.browsertrix_config self.is_docker = config.is_docker + self.media = [] def __str__(self): return self.__class__.__name__ @@ -48,13 +51,28 @@ class Archiver(ABC): @abstractmethod def download(self, url, check_if_exists=False): pass + def generateArchiveResult(self, **kwargs): + # remove duplicates + if "cdn_url" in kwargs: + self.add_to_media(kwargs["cdn_url"], None, kwargs.get("hash")) + kwargs["media"] = [dict(t) for t in {tuple(d.items()) for d in self.media}] + return ArchiveResult(**kwargs) + def get_netloc(self, url): return urlparse(url).netloc + def add_to_media(self, cdn_url: str, key: str = None, hash: str = None): + media_info = {"url": cdn_url, "mime": self._guess_file_type(cdn_url) or "misc"} + if key: media_info["key"] = key + if hash: media_info["hash"] = hash + self.media.append(media_info) + def generate_media_page_html(self, url, urls_info: dict, object, thumbnail=None): """ Generates an index.html page where each @urls_info is displayed """ + for ui in urls_info: + self.add_to_media(ui["cdn_url"], ui["key"], ui["hash"]) page = f'''{url}

Archived media from {self.name}

@@ -109,6 +127,8 @@ class Archiver(ABC): For a list of media urls, fetch them, upload them and call self.generate_media_page_html with them """ + for media_url in urls: + self.add_to_media(media_url) thumbnail = None uploaded_media = [] @@ -201,17 +221,20 @@ class Archiver(ABC): self.driver.save_screenshot(filename) self.storage.upload(filename, key, extra_args={'ACL': 'public-read', 'ContentType': 'image/png'}) - return self.storage.get_cdn_url(key) + cdn_url = self.storage.get_cdn_url(key) + self.add_to_media(cdn_url, key) + + return cdn_url def get_wacz(self, url): if not self.browsertrix.enabled: logger.debug(f"Browsertrix WACZ generation is not enabled, skipping.") - return + return if self.is_docker: # TODO: figure out support for browsertrix in docker # see: https://github.com/bellingcat/auto-archiver/issues/66 logger.warning(f"Browsertrix WACZ is not yet supported when using DOCKER.") - return + return logger.debug(f"getting wacz for {url}") key = self._get_key_from_url(url, ".wacz", append_datetime=True) @@ -220,7 +243,7 @@ class Archiver(ABC): browsertrix_home = os.path.join(os.getcwd(), "browsertrix-tmp") cmd = [ "docker", "run", - "--rm", # delete container once it has completed running + "--rm", # delete container once it has completed running "-v", f"{browsertrix_home}:/crawls/", # "-it", # this leads to "the input device is not a TTY" "webrecorder/browsertrix-crawler", "crawl", @@ -253,18 +276,19 @@ class Archiver(ABC): # do not crash if upload fails try: self.storage.upload(filename, key, extra_args={ - 'ACL': 'public-read', 'ContentType': 'application/zip'}) + 'ACL': 'public-read', 'ContentType': 'application/zip'}) except FileNotFoundError as e: logger.warning(f"Unable to locate and upload WACZ {filename=}, {key=}") - # clean up the local browsertrix files try: shutil.rmtree(browsertrix_home) except PermissionError: logger.warn(f"Unable to clean up browsertrix-crawler files in {browsertrix_home}") - return self.storage.get_cdn_url(key) + cdn_url = self.storage.get_cdn_url(key) + self.add_to_media(cdn_url, key) + return cdn_url def get_thumbnails(self, filename, key, duration=None): thumbnails_folder = os.path.splitext(filename)[0] + os.path.sep diff --git a/src/archivers/instagram_archiver.py b/src/archivers/instagram_archiver.py index a2b1147..62db876 100644 --- a/src/archivers/instagram_archiver.py +++ b/src/archivers/instagram_archiver.py @@ -52,7 +52,7 @@ class InstagramArchiver(Archiver): cdn_url = self.storage.get_cdn_url(key) screenshot = self.get_screenshot(url) wacz = self.get_wacz(url) - return ArchiveResult(status='already archived', cdn_url=cdn_url, screenshot=screenshot, wacz=wacz) + return self.generateArchiveResult(status='already archived', cdn_url=cdn_url, screenshot=screenshot, wacz=wacz) try: # process if post @@ -137,4 +137,4 @@ class InstagramArchiver(Archiver): screenshot = self.get_screenshot(url) wacz = self.get_wacz(url) - return ArchiveResult(status=status, cdn_url=page_cdn, title=title, timestamp=date, hash=page_hash, screenshot=screenshot, wacz=wacz) + return self.generateArchiveResult(status=status, cdn_url=page_cdn, title=title, timestamp=date, hash=page_hash, screenshot=screenshot, wacz=wacz) diff --git a/src/archivers/telegram_archiver.py b/src/archivers/telegram_archiver.py index 026bdd0..c6d8747 100644 --- a/src/archivers/telegram_archiver.py +++ b/src/archivers/telegram_archiver.py @@ -47,7 +47,7 @@ class TelegramArchiver(Archiver): time_elements = s.find_all('time') timestamp = time_elements[0].get('datetime') if len(time_elements) else None - return ArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=timestamp, wacz=wacz) + return self.generateArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=timestamp, wacz=wacz) video_url = video.get('src') video_id = video_url.split('/')[-1].split('?')[0] @@ -85,5 +85,5 @@ class TelegramArchiver(Archiver): os.remove(filename) cdn_url = self.storage.get_cdn_url(key) - return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb, thumbnail_index=thumb_index, + return self.generateArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb, thumbnail_index=thumb_index, duration=duration, title=original_url, timestamp=s.find_all('time')[1].get('datetime'), hash=hash, screenshot=screenshot, wacz=wacz) diff --git a/src/archivers/telethon_archiver.py b/src/archivers/telethon_archiver.py index 5c147de..f0ff194 100644 --- a/src/archivers/telethon_archiver.py +++ b/src/archivers/telethon_archiver.py @@ -80,7 +80,7 @@ class TelethonArchiver(Archiver): if check_if_exists and self.storage.exists(key): # only s3 storage supports storage.exists as not implemented on gd cdn_url = self.storage.get_cdn_url(key) - return ArchiveResult(status='already archived', cdn_url=cdn_url, title=post.message, timestamp=post.date, screenshot=screenshot, wacz=wacz) + return self.generateArchiveResult(status='already archived', cdn_url=cdn_url, title=post.message, timestamp=post.date, screenshot=screenshot, wacz=wacz) key_thumb, thumb_index = None, None group_id = post.grouped_id if post.grouped_id is not None else post.id @@ -119,7 +119,7 @@ class TelethonArchiver(Archiver): page_cdn, page_hash, _ = self.generate_media_page_html(url, uploaded_media, html.escape(str(post))) - return ArchiveResult(status=status, cdn_url=page_cdn, title=message, timestamp=post.date, hash=page_hash, screenshot=screenshot, thumbnail=key_thumb, thumbnail_index=thumb_index, wacz=wacz) + return self.generateArchiveResult(status=status, cdn_url=page_cdn, title=message, timestamp=post.date, hash=page_hash, screenshot=screenshot, thumbnail=key_thumb, thumbnail_index=thumb_index, wacz=wacz) page_cdn, page_hash, _ = self.generate_media_page_html(url, [], html.escape(str(post))) - return ArchiveResult(status=status, cdn_url=page_cdn, title=post.message, timestamp=getattr_or(post, "date"), hash=page_hash, screenshot=screenshot, wacz=wacz) + return self.generateArchiveResult(status=status, cdn_url=page_cdn, title=post.message, timestamp=getattr_or(post, "date"), hash=page_hash, screenshot=screenshot, wacz=wacz) diff --git a/src/archivers/tiktok_archiver.py b/src/archivers/tiktok_archiver.py index bdaad52..55cb97e 100644 --- a/src/archivers/tiktok_archiver.py +++ b/src/archivers/tiktok_archiver.py @@ -28,9 +28,9 @@ class TiktokArchiver(Archiver): if len(media) <= 0: if status == 'already archived': - return ArchiveResult(status='Could not download media, but already archived', cdn_url=self.storage.get_cdn_url(key)) + return self.generateArchiveResult(status='Could not download media, but already archived', cdn_url=self.storage.get_cdn_url(key)) else: - return ArchiveResult(status='Could not download media') + return self.generateArchiveResult(status='Could not download media') logger.info(f'downloading video {key=}') media[0].download(filename) @@ -56,17 +56,17 @@ class TiktokArchiver(Archiver): cdn_url = self.storage.get_cdn_url(key) timestamp = info.create.isoformat() if hasattr(info, "create") else None - return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb, + return self.generateArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb, thumbnail_index=thumb_index, duration=getattr(info, "duration", 0), title=getattr(info, "caption", ""), timestamp=timestamp, hash=hash, screenshot=screenshot, wacz=wacz) except tiktok_downloader.Except.InvalidUrl as e: status = 'Invalid URL' logger.warning(f'Invalid URL on {url} {e}\n{traceback.format_exc()}') - return ArchiveResult(status=status) + return self.generateArchiveResult(status=status) except: error = traceback.format_exc() status = 'Other Tiktok error: ' + str(error) logger.warning(f'Other Tiktok error' + str(error)) - return ArchiveResult(status=status) + return self.generateArchiveResult(status=status) diff --git a/src/archivers/twitter_api_archiver.py b/src/archivers/twitter_api_archiver.py index 454cfe2..da56d31 100644 --- a/src/archivers/twitter_api_archiver.py +++ b/src/archivers/twitter_api_archiver.py @@ -40,7 +40,7 @@ class TwitterApiArchiver(TwitterArchiver): # only s3 storage supports storage.exists as not implemented on gd cdn_url = self.storage.get_cdn_url(key) screenshot = self.get_screenshot(url) - return ArchiveResult(status='already archived', cdn_url=cdn_url, title=tweet.data.text, timestamp=timestamp, screenshot=screenshot) + return self.generateArchiveResult(status='already archived', cdn_url=cdn_url, title=tweet.data.text, timestamp=timestamp, screenshot=screenshot) urls = [] if tweet.includes: @@ -72,4 +72,4 @@ class TwitterApiArchiver(TwitterArchiver): screenshot = self.get_screenshot(url) wacz = self.get_wacz(url) page_cdn, page_hash, thumbnail = self.generate_media_page(urls, url, output) - return ArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=timestamp, title=tweet.data.text, wacz=wacz) + return self.generateArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=timestamp, title=tweet.data.text, wacz=wacz) diff --git a/src/archivers/twitter_archiver.py b/src/archivers/twitter_archiver.py index b868af5..f1f22c0 100644 --- a/src/archivers/twitter_archiver.py +++ b/src/archivers/twitter_archiver.py @@ -41,7 +41,7 @@ class TwitterArchiver(Archiver): screenshot = self.get_screenshot(url) wacz = self.get_wacz(url) page_cdn, page_hash, _ = self.generate_media_page_html(url, [], html.escape(tweet.json())) - return ArchiveResult(status="success", cdn_url=page_cdn, title=tweet.content, timestamp=tweet.date, hash=page_hash, screenshot=screenshot, wacz=wacz) + return self.generateArchiveResult(status="success", cdn_url=page_cdn, title=tweet.content, timestamp=tweet.date, hash=page_hash, screenshot=screenshot, wacz=wacz) urls = [] @@ -62,7 +62,7 @@ class TwitterArchiver(Archiver): screenshot = self.get_screenshot(url) wacz = self.get_wacz(url) - return ArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=tweet.date, title=tweet.content, wacz=wacz) + return self.generateArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=tweet.date, title=tweet.content, wacz=wacz) def download_alternative(self, url, tweet_id): # https://stackoverflow.com/a/71867055/6196010 @@ -87,7 +87,7 @@ class TwitterArchiver(Archiver): screenshot = self.get_screenshot(url) wacz = self.get_wacz(url) page_cdn, page_hash, thumbnail = self.generate_media_page(urls, url, r.text) - return ArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=timestamp, title=tweet["text"], wacz=wacz) + return self.generateArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=timestamp, title=tweet["text"], wacz=wacz) def choose_variant(self, variants): # choosing the highest quality possible diff --git a/src/archivers/vk_archiver.py b/src/archivers/vk_archiver.py index 91b8354..1d38fa9 100644 --- a/src/archivers/vk_archiver.py +++ b/src/archivers/vk_archiver.py @@ -31,7 +31,7 @@ class VkArchiver(Archiver): # if check_if_exists and self.storage.exists(key): # screenshot = self.get_screenshot(url) # cdn_url = self.storage.get_cdn_url(key) - # return ArchiveResult(status="already archived", cdn_url=cdn_url, screenshot=screenshot) + # return self.generateArchiveResult(status="already archived", cdn_url=cdn_url, screenshot=screenshot) results = self.vks.scrape(url) # some urls can contain multiple wall/photo/... parts and all will be fetched if len(results) == 0: @@ -71,4 +71,4 @@ class VkArchiver(Archiver): # # if multiple wall/photos/videos are present the screenshot will only grab the 1st screenshot = self.get_screenshot(url) wacz = self.get_wacz(url) - return ArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, thumbnail_index=thumbnail_index, timestamp=datetime, title=title, wacz=wacz) + return self.generateArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, thumbnail_index=thumbnail_index, timestamp=datetime, title=title, wacz=wacz) diff --git a/src/archivers/wayback_archiver.py b/src/archivers/wayback_archiver.py index e0ede90..1bfa78a 100644 --- a/src/archivers/wayback_archiver.py +++ b/src/archivers/wayback_archiver.py @@ -39,7 +39,7 @@ class WaybackArchiver(Archiver): if r.status_code != 200: logger.warning(f"Internet archive failed with status of {r.status_code}") - return ArchiveResult(status="Internet archive failed", screenshot=screenshot, wacz=wacz) + return self.generateArchiveResult(status="Internet archive failed", screenshot=screenshot, wacz=wacz) if 'job_id' not in r.json() and 'message' in r.json(): return self.custom_retry(r.json(), screenshot=screenshot, wacz=wacz) @@ -61,7 +61,7 @@ class WaybackArchiver(Archiver): retries += 1 if status_r.status_code != 200: - return ArchiveResult(status=f"Internet archive failed: check https://web.archive.org/save/status/{job_id}", screenshot=screenshot, wacz=wacz) + return self.generateArchiveResult(status=f"Internet archive failed: check https://web.archive.org/save/status/{job_id}", screenshot=screenshot, wacz=wacz) status_json = status_r.json() if status_json['status'] != 'success': @@ -77,7 +77,7 @@ class WaybackArchiver(Archiver): title = 'Could not get title' except: title = "Could not get title" - self.seen_urls[url] = ArchiveResult(status='success', cdn_url=archive_url, title=title, screenshot=screenshot, wacz=wacz) + self.seen_urls[url] = self.generateArchiveResult(status='success', cdn_url=archive_url, title=title, screenshot=screenshot, wacz=wacz) return self.seen_urls[url] def custom_retry(self, json_data, **kwargs): @@ -86,4 +86,4 @@ class WaybackArchiver(Archiver): return self.signal_retry_in(**kwargs) if "this host has been already captured" in str(json_data).lower(): return self.signal_retry_in(**kwargs, min_seconds=86400, max_seconds=129600) # 24h to 36h later - return ArchiveResult(status=f"Internet archive failed: {json_data}", **kwargs) + return self.generateArchiveResult(status=f"Internet archive failed: {json_data}", **kwargs) diff --git a/src/archivers/youtubedl_archiver.py b/src/archivers/youtubedl_archiver.py index 5d09442..e2f27a2 100644 --- a/src/archivers/youtubedl_archiver.py +++ b/src/archivers/youtubedl_archiver.py @@ -38,7 +38,7 @@ class YoutubeDLArchiver(Archiver): if info.get('is_live', False): logger.warning("Live streaming media, not archiving now") - return ArchiveResult(status="Streaming media") + return self.generateArchiveResult(status="Streaming media") if 'twitter.com' in netloc: if 'https://twitter.com/' in info['webpage_url']: @@ -114,5 +114,5 @@ class YoutubeDLArchiver(Archiver): elif 'upload_date' in info and info['upload_date'] is not None: timestamp = datetime.datetime.strptime(info['upload_date'], '%Y%m%d').replace(tzinfo=datetime.timezone.utc) - return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb, thumbnail_index=thumb_index, duration=duration, + return self.generateArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb, thumbnail_index=thumb_index, duration=duration, title=info['title'] if 'title' in info else None, timestamp=timestamp, hash=hash, screenshot=screenshot, wacz=wacz) diff --git a/src/auto_archive.py b/src/auto_archive.py index 3412b0a..a797405 100644 --- a/src/auto_archive.py +++ b/src/auto_archive.py @@ -57,7 +57,7 @@ def missing_required_columns(gw: GWorksheet): return missing -def should_process_sheet(c, sheet_name): +def should_process_sheet(c: Config, sheet_name): if len(c.worksheet_allow) and sheet_name not in c.worksheet_allow: # ALLOW rules exist AND sheet name not explicitly allowed return False @@ -67,6 +67,50 @@ def should_process_sheet(c, sheet_name): return True +def archive_url(c: Config, url: str, folder: str, debug_string: str, is_retry: bool): + url = expand_url(url) + c.set_folder(folder) + storage = c.get_storage() + + # make a new driver so each spreadsheet row is idempotent + c.recreate_webdriver() + + # order matters, first to succeed excludes remaining + active_archivers = [ + TelethonArchiver(storage, c), + TiktokArchiver(storage, c), + TwitterApiArchiver(storage, c), + InstagramArchiver(storage, c), + YoutubeDLArchiver(storage, c), + TelegramArchiver(storage, c), + TwitterArchiver(storage, c), + VkArchiver(storage, c), + WaybackArchiver(storage, c) + ] + + for archiver in active_archivers: + logger.debug(f'Trying {archiver} on {debug_string}') + + try: + result = archiver.download(url, check_if_exists=c.check_if_exists) + except KeyboardInterrupt as e: raise e # so the higher level catch can catch it + except Exception as e: + result = False + logger.error(f'Got unexpected error in {debug_string} with {archiver.name} for {url=}: {e}\n{traceback.format_exc()}') + + if result: + success = result.status in ['success', 'already archived'] + result.status = f"{archiver.name}: {result.status}" + if success: + logger.success(f'{archiver.name} succeeded on {debug_string}, {url=}') + break + # only 1 retry possible for now + if is_retry and Archiver.is_retry(result.status): + result.status = Archiver.remove_retry(result.status) + logger.warning(f'{archiver.name} did not succeed on {debug_string}, final status: {result.status}') + return result + + def process_sheet(c: Config): sh = c.gsheets_client.open(c.sheet) @@ -100,46 +144,7 @@ def process_sheet(c: Config): # All checks done - archival process starts here try: gw.set_cell(row, 'status', 'Archive in progress') - url = expand_url(url) - c.set_folder(gw.get_cell_or_default(row, 'folder', default_folder, when_empty_use_default=True)) - - # make a new driver so each spreadsheet row is idempotent - c.recreate_webdriver() - - # order matters, first to succeed excludes remaining - active_archivers = [ - TelethonArchiver(storage, c), - TiktokArchiver(storage, c), - TwitterApiArchiver(storage, c), - InstagramArchiver(storage, c), - YoutubeDLArchiver(storage, c), - TelegramArchiver(storage, c), - TwitterArchiver(storage, c), - VkArchiver(storage, c), - WaybackArchiver(storage, c) - ] - - for archiver in active_archivers: - logger.debug(f'Trying {archiver} on {row=}') - - try: - result = archiver.download(url, check_if_exists=c.check_if_exists) - except KeyboardInterrupt as e: raise e # so the higher level catch can catch it - except Exception as e: - result = False - logger.error(f'Got unexpected error in row {row} with {archiver.name} for {url=}: {e}\n{traceback.format_exc()}') - - if result: - success = result.status in ['success', 'already archived'] - result.status = f"{archiver.name}: {result.status}" - if success: - logger.success(f'{archiver.name} succeeded on {row=}, {url=}') - break - # only 1 retry possible for now - if is_retry and Archiver.is_retry(result.status): - result.status = Archiver.remove_retry(result.status) - logger.warning(f'{archiver.name} did not succeed on {row=}, final status: {result.status}') - + result = archive_url(c, url, gw.get_cell_or_default(row, 'folder', default_folder, when_empty_use_default=True), f"{row=}", is_retry=is_retry) if result: update_sheet(gw, row, url, result) else: diff --git a/src/cli.py b/src/cli.py index e69de29..b6d2b70 100644 --- a/src/cli.py +++ b/src/cli.py @@ -0,0 +1,30 @@ +import tempfile, json +import auto_archive +from loguru import logger +from configs import Config +from storages import Storage +from slugify import slugify + + +def main(): + c = Config() + c.parse() + url = c.url + if not url: + logger.error("Invalid URL: '{url}'") + return + logger.info(f'Archiving "{url=}".') + with tempfile.TemporaryDirectory(dir="./") as tmpdir: + Storage.TMP_FOLDER = tmpdir + result = auto_archive.archive_url(c, url, "", f"{url=}", False) + c.destroy_webdriver() + key = f"media_{slugify(url)}.json" + with open(key, "w", encoding="utf-8") as outf: + json.dump(result.media, outf, ensure_ascii=False, indent=4) + c.get_storage().upload(key, key) + print(result) + return result + + +if __name__ == "__main__": + main() diff --git a/src/configs/config.py b/src/configs/config.py index 01b8173..bbd385e 100644 --- a/src/configs/config.py +++ b/src/configs/config.py @@ -47,6 +47,8 @@ class Config: with open(self.config_file, "r", encoding="utf-8") as inf: self.config = yaml.safe_load(inf) + self.url = getattr_or(self.args, "url", '') + # ----------------------EXECUTION - execution configurations execution = self.config.get("execution", {}) @@ -211,6 +213,7 @@ class Config: """ parser = argparse.ArgumentParser(description='Automatically archive social media posts, videos, and images from a Google Sheets document. The command line arguments will always override the configurations in the provided YAML config file (--config), only some high-level options are allowed via the command line and the YAML configuration file is the preferred method. The sheet must have the "url" and "status" for the archiver to work. ') + parser.add_argument('--url', action='store', dest='url', help='single URL to archive - to use only via cli.py and not google sheets interaction') parser.add_argument('--config', action='store', dest='config', help='the filename of the YAML configuration file (defaults to \'config.yaml\')', default='config.yaml') parser.add_argument('--storage', action='store', dest='storage', help='which storage to use [execution.storage in config.yaml]', choices=Config.AVAILABLE_STORAGES) parser.add_argument('--sheet', action='store', dest='sheet', help='the name of the google sheets document [execution.sheet in config.yaml]') diff --git a/src/storages/base_storage.py b/src/storages/base_storage.py index cde00fe..f147678 100644 --- a/src/storages/base_storage.py +++ b/src/storages/base_storage.py @@ -1,3 +1,4 @@ +import os, uuid from loguru import logger from abc import ABC, abstractmethod from pathlib import Path @@ -18,6 +19,14 @@ class Storage(ABC): @abstractmethod def uploadf(self, file, key, **kwargs): pass + def clean_key(self, key): + # Some storages does not work well with trailing forward slashes and some keys come with that + if key.startswith('/'): + logger.debug(f'Found and fixed a leading "/" for {key=}') + return key[1:] + return key + + def upload(self, filename: str, key: str, **kwargs): logger.debug(f'[{self.__class__.__name__}] uploading file {filename} with key {key}') with open(filename, 'rb') as f: diff --git a/src/storages/gd_storage.py b/src/storages/gd_storage.py index 5f3bbeb..3af77f1 100644 --- a/src/storages/gd_storage.py +++ b/src/storages/gd_storage.py @@ -116,13 +116,6 @@ class GDStorage(Storage): # GD only requires the filename not a file reader self.uploadf(filename, key, **kwargs) - def clean_key(self, key): - # GDrive does not work well with trailing forward slashes and some keys come with that - if key.startswith('/'): - logger.debug(f'Found and fixed a leading "/" for {key=}') - return key[1:] - return key - # gets the Drive folderID if it is there def _get_id_from_parent_and_name(self, parent_id: str, name: str, retries: int = 1, sleep_seconds: int = 10, use_mime_type: bool = False, raise_on_missing: bool = True, use_cache=False): """ diff --git a/src/storages/local_storage.py b/src/storages/local_storage.py index ca328e0..1109767 100644 --- a/src/storages/local_storage.py +++ b/src/storages/local_storage.py @@ -1,6 +1,7 @@ import os from dataclasses import dataclass +from loguru import logger from .base_storage import Storage from utils import mkdir_if_not_exists @@ -18,8 +19,12 @@ class LocalStorage(Storage): mkdir_if_not_exists(self.save_to) def get_cdn_url(self, key): + key = self.clean_key(key) + logger.info(f"{key=}") full_path = os.path.join(self.save_to, self.folder, key) - mkdir_if_not_exists(os.path.join(*full_path.split(os.path.sep)[0:-1])) + logger.debug(f"{full_path=} creating dir structure to {os.path.dirname(full_path)}") + os.makedirs(os.path.dirname(full_path), exist_ok=True) + # mkdir_if_not_exists(os.path.join(*full_path.split(os.path.sep)[0:-1])) return os.path.abspath(full_path) def exists(self, key): From 6a0ce5ced18d94bd9a7454cfe77f079ff80313f8 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Fri, 11 Nov 2022 02:08:48 +0000 Subject: [PATCH 020/190] orchestrator design structure --- orchestrate.yaml | 48 ++++++++++ src/orchestrator.py | 215 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 263 insertions(+) create mode 100644 orchestrate.yaml create mode 100644 src/orchestrator.py diff --git a/orchestrate.yaml b/orchestrate.yaml new file mode 100644 index 0000000..9a4ec42 --- /dev/null +++ b/orchestrate.yaml @@ -0,0 +1,48 @@ +steps: + # only 1 feeder allowed + # a feeder could be in an "infinite loop" for example: gsheets_infinite feeder which holds-> this could be an easy logic addiction by modifying for each to while not feeder.done() if it becomes necessary + feeder: gsheets_feeder # default -> only expects URL from CLI + archivers: # order matters + - tiktok + - telethon + - twitter + - instagram + - webarchive # this way it runs as a failsafe only + enrichments: + - screenshot + - wacz + - webarchive # this way it runs for every case, webarchive extends archiver and enrichment + - thumbnails + formatters: + - HTMLFormater + - PDFFormater + storages: + - local_storage + - s3 + databases: + - gsheets_db + - mongo_db + + + +configurations: + gsheets_feeder: + - sheet: "Auto archiver" + - header: "" # defaults to 1 in GSheetsFeeder + - service_account: "secrets/service_account.json" + tiktok: + username: "abc" + password: "123" + token: "here" + screenshot: + width: 1280 + height: 720 + wacz: + profile: secrets/profile.tar.gz + webarchive: + api_key: "12345" + s3: + - bucket: 123 + - region: "nyc3" + - cdn: "{region}{bucket}" + diff --git a/src/orchestrator.py b/src/orchestrator.py new file mode 100644 index 0000000..30f7b3c --- /dev/null +++ b/src/orchestrator.py @@ -0,0 +1,215 @@ +from typing import Union, Dict +from __future__ import annotations +from dataclasses import dataclass + +""" +how not to couple the different pieces of logic +due to the use of constants for the metadata keys? +perhaps having methods on the Metadata level that can be used to fetch a limited number of +keys, never using strings but rather methods? +eg: m = Metadata() + m.get("screenshot") vs m.get_all() + m.get_url() + m.get_hash() + m.get_main_file().get_title() + m.get_screenshot() # this method should only exist because of the Screenshot Enricher + # maybe there is a way for Archivers and Enrichers and Storages to add their own methdods + # which raises still the Q of how the database, eg., knows they exist? + # maybe there's a function to fetch them all, and each Database can register wathever they get + # for eg the GoogleSheets will only register based on the available column names, it knows what it wants + # and if it's there: great, otherwise business as usual. + # and a MongoDatabase could register all data, for example. + # +How are Orchestrators created? from a configuration file? + orchestrator = ArchivingOrchestrator(config) + # Config contains 1 URL, or URLs, from the command line + # OR a feeder which is described in the config file + # config.get_feeder() # if called as docker run --url "http...." then the uses the default filter + # if config.yaml says config + orchestrator.start() + + +Example applications: +1. auto-archiver for GSheets +2. archiver for URL: feeder is CLIFeeder(config.cli.urls="") # --urls="u1,u2" +3. archiver backend for a UI that implements a REST API, the API calls CLI + +Cisticola considerations: +1. By isolating the archiving logic into "Archiving only pieces of logic" these could simply call cisticola.tiktok_scraper(user, pass) +2. So the auto-archiver becomes like a puzzle and fixes to Cisticola scrapers can immediately benefit it, and contributions are focused on a single source or scraping +""" + +@dataclass +class Metadata: + # does not handle files, only primitives + # the only piece of logic to handle files is the archiver, enricher, and storage + status: str + # title: str + # url: str + # hash: str + main_file: Metadata + metadata: Dict[str, Metadata] + + @staticmethod + def merge(left, right : Metadata, overwrite_left=True) -> Metadata: + # should return a merged version of the Metadata + # will work for archived() and enriched() + # what if 2 metadatas contain the same keys? only one can remain! : overwrite_left + pass + + def get(self, key) -> Union[Metadata, str]: + # goes through metadata and returns the Metadata available + pass + + def as_json(self) -> str: + # converts all metadata and data into JSON + pass + + +""" +@dataclass +class ArchiveResult: + # maybe metadata can have status as well, eg: screenshot fails. should that be registered in the databases? likely yes + status: str + url: str + metadata: Metadata + # title, url, hash, other={} + # cdn_url: str = None + # thumbnail: str = None + # thumbnail_index: str = None + # duration: float = None + # title: str = None + # timestamp: datetime.datetime = None + # screenshot: str = None + # wacz: str = None + # hash: str = None + # media: list = field(default_factory=list) + + def __init__(self) -> None: pass + + def update(self, metadata) -> None: + # receive a Metadata instance and update itself with it! + pass + + def as_json(self) -> str: + # converts all metadata and data into JSON + pass +""" + +""" +There is a Superclass for: + * Database (should_process) + +How can GSheets work? it needs to feed from a READER (GSheets Feeder) + +Once an archiver returns a link to a local file (for eg to a storage), how do we then delete the produced local files? +The context metadata should include a temporary folder (maybe a LocalStorage instance?) +""" + +class ArchivingOrchestrator: + def __init__(self, config) -> None: + # in config.py we should test that the archivers exist and log mismatches (blocking execution) + # identify each formatter, storage, database, etc + self.feeder = Feeder.init(config.feeder, config.get(config.feeder)) + + # Is it possible to overwrite config.yaml values? it could be useful: share config file and modify gsheets_feeder.sheet via CLI + # where does that update/processing happen? in config.py + # reflection for Archiver to know wihch child classes it has? use Archiver.__subclasses__ + self.archivers = [ + Archiver.init(a, config.get(a)) + for a in config.archivers + ] + + self.enrichments = [ + Enrichment.init(e, config.get(e)) + for e in config.enrichments + ] + + self.formatters = [ + Formatter.init(f, config.get(f)) + for f in config.formatters + ] + + self.storages = [ + Storage.init(s, config.get(s)) + for s in config.storages + ] + + self.databases = [ + Database.init(f, config.get(f)) + for f in config.formatters + ] + + # these rules are checked in config.py + assert len(archivers) > 1, "there needs to be at least one Archiver" + + def feed(self, feeder: Feeder) -> list(ArchiveResult): + for next in feeder: + self.archive(next) + # how does this handle the parameters like folder which can be different for each archiver? + # the storage needs to know where to archive!! + # solution: feeders have context: extra metadata that they can read or ignore, + # all of it should have sensible defaults (eg: folder) + # default feeder is a list with 1 element + + def archive(url) -> Union[ArchiveResult, None]: + url = clear_url(url) + result = Metadata(url=url) + + + should_archive = True + for d in databases: should_archive &= d.should_process(url) + # should storages also be able to check? + for s in storages: should_archive &= s.should_process(url) + + if not should_archive: + return "skipping" + + # signal to DB that archiving has started + for d in databases: + # are the databases to decide whether to archive? + # they can simply return True by default, otherwise they can avoid duplicates. should this logic be more granular, for example on the archiver level: a tweet will not need be scraped twice, whereas an instagram profile might. the archiver could not decide from the link which parts to archive, + # instagram profile example: it would always re-archive everything + # maybe the database/storage could use a hash/key to decide if there's a need to re-archive + if d.should_process(url): + d.started(url) + elif d.exists(url): + return d.fetch(url) + else: + print("Skipping url") + return + + # vk, telethon, ... + for a in archivers: + # with automatic try/catch in download + archived (+ the other ops below) + # should the archivers come with the config already? are there configs which change at runtime? + # think not, so no need to pass config as parameter + # do they need to be refreshed with every execution? + # this is where the Hashes come from, the place with access to all content + # the archiver does not have access to storage + result.update(a.download(url)) + if result.is_success(): break + + # what if an archiver returns multiple entries and one is to be part of HTMLgenerator? + # should it call the HTMLgenerator as if it's not an enrichment? + # eg: if it is enable: generates an HTML with all the returned media, should it include enrichments? yes + # then how to execute it last? should there also be post-processors? are there other examples? + # maybe as a PDF? or a Markdown file + # side captures: screenshot, wacz, webarchive, thumbnails, HTMLgenerator + for e in enrichments: + result.update(e.enrich(result)) + + # formatters, enrichers, and storages will sometimes look for specific properties: eg
  • Screenshot:
  • + for p in formatter: + result.update(p.process(result)) + + # storages + for s in storages: + for m in result.media: + m.update(s.store(m)) + + # signal completion to databases (DBs, Google Sheets, CSV, ...) + # a hash registration service could be one database: forensic archiving + for d in databases: d.done( result) + + return result \ No newline at end of file From 65dd155c9047041f468a3a4e44866fae858d6cb2 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Tue, 15 Nov 2022 15:00:52 +0000 Subject: [PATCH 021/190] WIP refactor logic --- README.md | 40 ++++++++++++-- orchestrate.yaml | 8 +-- src/configs/v2config.py | 80 ++++++++++++++++++++++++++++ src/enrichers/__init__.py | 2 + src/enrichers/enricher.py | 20 +++++++ src/enrichers/enricher_screenshot.py | 53 ++++++++++++++++++ src/metadata.py | 30 +++++++++++ src/orchestrator.py | 64 +++++++++++----------- src/step.py | 30 +++++++++++ src/utils/__init__.py | 3 +- src/utils/util.py | 20 +++++++ src/v2.py | 9 ++++ 12 files changed, 320 insertions(+), 39 deletions(-) create mode 100644 src/configs/v2config.py create mode 100644 src/enrichers/__init__.py create mode 100644 src/enrichers/enricher.py create mode 100644 src/enrichers/enricher_screenshot.py create mode 100644 src/metadata.py create mode 100644 src/step.py create mode 100644 src/utils/util.py create mode 100644 src/v2.py diff --git a/README.md b/README.md index 8bdc7d5..11ff002 100644 --- a/README.md +++ b/README.md @@ -4,15 +4,40 @@ Read the [article about Auto Archiver on bellingcat.com](https://www.bellingcat. Python script to automatically archive social media posts, videos, and images from a Google Sheets document. Uses different archivers depending on the platform, and can save content to local storage, S3 bucket (Digital Ocean Spaces, AWS, ...), and Google Drive. The Google Sheets where the links come from is updated with information about the archived content. It can be run manually or on an automated basis. + +# Requirement configurations +# Running with docker +# Running without docker + + + +### Setup checklist +Use this to make sure you help making sure you did all the required steps: +* [ ] you have a `/secrets` folder with all your configuration files including + * [ ] a configuration file eg: `config.yaml` pointing to the correct location of other files + * [ ] you have a `service_account.json` + * [ ] (optional for telegram) a `anon.session` which appears after the 1st run to avoid logging into the + * [ ] (optional for VK) a `vk_config.v2.json` + * [ ] (optional for using GoogleDrive storage) `gd-token.json` + * [ ] (optional for instagram) `instaloader.session` file which appears after the 1st run and login in telegram + * [ ] (optional for browsertrix) `profile.tar.gz` file + ## Setup +### Always required +1. [A Google Service account is necessary for use with `gspread`.](https://gspread.readthedocs.io/en/latest/oauth2.html#for-bots-using-service-account) Credentials for this account should be stored in `service_account.json`, in the same directory as the script. +2. A configuration file, see [Configuration file](#configuration-file). -Check this [tutorial video](https://youtu.be/VfAhcuV2tLQ). +### With docker image +[Docker](https://www.docker.com/) is like a virtual machine program that isolates all the installation dependencies needed for the auto-archiver and it should be the only thing you need to install. + +### Without docker +Check this [tutorial video](https://youtu.be/VfAhcuV2tLQ) for setup without the docker image. If you are using `pipenv` (recommended), `pipenv install` is sufficient to install Python prerequisites. -You also need: +You need to install the following requirements on your machine: 1. [A Google Service account is necessary for use with `gspread`.](https://gspread.readthedocs.io/en/latest/oauth2.html#for-bots-using-service-account) Credentials for this account should be stored in `service_account.json`, in the same directory as the script. 2. [ffmpeg](https://www.ffmpeg.org/) must also be installed locally for this tool to work. 3. [firefox](https://www.mozilla.org/en-US/firefox/new/) and [geckodriver](https://github.com/mozilla/geckodriver/releases) on a path folder like `/usr/local/bin`. @@ -22,7 +47,7 @@ You also need: 1. To improve the websites browsertrix can archive you can also create a custom profile by running `docker run -p 9222:9222 -p 9223:9223 -v $PWD/browsertrix/crawls/profiles:/crawls/profiles/ -it webrecorder/browsertrix-crawler create-login-profile --interactive --url "https://youtube.com"`, going to [http://localhost:9223/](http://localhost:9223/) and accepting the cookies prompt on youtube, and then navigating to other websites and logging in as per your needs, so as to access more publicly blocked content, and then specifying the created `profile.tar.gz` in your config file under `execution.browsertrix.profile`. ### Configuration file -Configuration is done via a config.yaml file (see [example.config.yaml](example.config.yaml)) and some properties of that file can be overwritten via command line arguments. Here is the current result from running the `python auto_archive.py --help`: +Configuration is done via a config.yaml file (see [example.config.yaml](example.config.yaml)) and some properties of that file can be overwritten via command line arguments. Make a copy of that file and rename it to your liking eg. `config-test.yaml` . Here is the current result from running the `python auto_archive.py --help`:
    python auto_archive.py --help @@ -151,6 +176,15 @@ To make it easier to set up new auto-archiver sheets, the auto-auto-archiver wil ![A screenshot of a Google Spreadsheet configured to show instructional text and a list of sheet names to check with auto-archiver.](docs/auto-auto.png) +# Docker development +* working with docker locally: + * `docker build . -t auto-archiver` to build a local image + * `docker run --rm -v $PWD/secrets:/app/secrets aa --config secrets/config.yaml` + * to use local archive, also create a volume `-v` for it by adding `-v $PWD/local_archive:/app/local_archive` +* release to docker hub + * `docker image tag auto-archiver bellingcat/auto-archiver:latest` + * `docker push bellingcat/auto-archiver` (validate [here]()) + # Code structure Code is split into functional concepts: 1. [Archivers](archivers/) - receive a URL that they try to archive diff --git a/orchestrate.yaml b/orchestrate.yaml index 9a4ec42..689765f 100644 --- a/orchestrate.yaml +++ b/orchestrate.yaml @@ -8,14 +8,14 @@ steps: - twitter - instagram - webarchive # this way it runs as a failsafe only - enrichments: + enrichers: - screenshot - wacz - webarchive # this way it runs for every case, webarchive extends archiver and enrichment - thumbnails formatters: - HTMLFormater - - PDFFormater + - PdfFormater storages: - local_storage - s3 @@ -26,6 +26,8 @@ steps: configurations: + global: + - save_logs: False gsheets_feeder: - sheet: "Auto archiver" - header: "" # defaults to 1 in GSheetsFeeder @@ -36,7 +38,7 @@ configurations: token: "here" screenshot: width: 1280 - height: 720 + height: 720000 wacz: profile: secrets/profile.tar.gz webarchive: diff --git a/src/configs/v2config.py b/src/configs/v2config.py new file mode 100644 index 0000000..4b072c3 --- /dev/null +++ b/src/configs/v2config.py @@ -0,0 +1,80 @@ + + +import argparse, yaml +from dataclasses import dataclass, field +from typing import List +from step import Step +from utils import Util +from enrichers import Enricher +from collections import defaultdict + + +@dataclass +class ConfigV2: + # TODO: should Config inherit from Step so it can have it's own configurations? + configurable_parents = [ + Enricher, + Util + ] + feeder : Step #TODO:= BaseFeeder + archivers: List[Step] = field(default_factory=[]) #TODO: fix type + enrichers: List[Enricher] = field(default_factory=[]) + formatters: List[Step] = field(default_factory=[]) #TODO: fix type + storages: List[Step] = field(default_factory=[]) #TODO: fix type + databases: List[Step] = field(default_factory=[]) #TODO: fix type + + def __init__(self) -> None: + self.defaults = {} + self.config = {} + + def parse(self): + # 1. parse CLI values + parser = argparse.ArgumentParser( + # prog = "auto-archiver", + description="Auto Archiver is a ...!", + epilog="Check the code at https://github.com/bellingcat/auto-archiver" + ) + + parser.add_argument('--config', action='store', dest='config', help='the filename of the YAML configuration file (defaults to \'config.yaml\')', default='config.yaml') + + for configurable in self.configurable_parents: + child: Step + for child in configurable.__subclasses__(): + for config, details in child.configs().items(): + assert "." not in child.name, f"class prop name cannot contain dots('.'): {child.name}" + assert "." not in config, f"config property cannot contain dots('.'): {config}" + config_path = f"{child.name}.{config}" + parser.add_argument(f'--{config_path}', action='store', dest=config_path, help=details['help']) + self.defaults[config_path] = details["default"] + + args = parser.parse_args() + + # 2. read YAML config file + with open(args.config, "r", encoding="utf-8") as inf: + self.yaml_config = yaml.safe_load(inf) + + # 3. CONFIGS: decide value with priority: CLI >> config.yaml >> default + self.config = defaultdict(dict) + for config_path, default in self.defaults.items(): + child, config = tuple(config_path.split(".")) + val = getattr(args, config_path) + if val is None: + val = self.yaml_config.get("configurations", {}).get(child, {}).get(config, default) + self.config[child][config] = val + self.config = dict(self.config) + + # 4. STEPS: read steps and validate they exist + steps = self.yaml_config.get("steps", {}) + assert "archivers" in steps, "your configuration steps are missing the archivers property" + assert "storages" in steps, "your configuration steps are missing the storages property" + + print(self.config) + + # self.feeder = Feeder.init + self.enrichers = [Enricher.init(steps.get("enrichers", [])[0], self.config)] + + + print(self.enrichers) + + def validate(self): + pass diff --git a/src/enrichers/__init__.py b/src/enrichers/__init__.py new file mode 100644 index 0000000..3c266f8 --- /dev/null +++ b/src/enrichers/__init__.py @@ -0,0 +1,2 @@ +from .enricher import Enricher +from .enricher_screenshot import ScreenshotEnricher \ No newline at end of file diff --git a/src/enrichers/enricher.py b/src/enrichers/enricher.py new file mode 100644 index 0000000..c767b8e --- /dev/null +++ b/src/enrichers/enricher.py @@ -0,0 +1,20 @@ +from __future__ import annotations +from dataclasses import dataclass +from abc import abstractmethod, ABC +from metadata import Metadata +from step import Step + +@dataclass +class Enricher(Step, ABC): + name = "enricher" + + def __init__(self, config: dict) -> None: + Step.__init__(self) + + + # only for typing... + def init(name: str, config: dict) -> Enricher: + return Step.init(name, config, Enricher) + + @abstractmethod + def enrich(self, item: Metadata) -> Metadata: pass diff --git a/src/enrichers/enricher_screenshot.py b/src/enrichers/enricher_screenshot.py new file mode 100644 index 0000000..04a2bf0 --- /dev/null +++ b/src/enrichers/enricher_screenshot.py @@ -0,0 +1,53 @@ +from . import Enricher +from metadata import Metadata +from loguru import logger + + +class ScreenshotEnricher(Enricher): + name = "screenshot" + + @staticmethod + def configs() -> dict: + return { + "width": {"default": 1280, "help": "width of the screenshots"}, + "height": {"default": 720, "help": "height of the screenshots"}, + } + + def enrich(self, item: Metadata) -> Metadata: + url = self.get_url(item) + print("enrich") + # driver = config.webdriver + # with driver as Webdriver(): # TODO: make a util + # #TODO: take screenshot + # pass + + # logger.debug(f"getting screenshot for {url=}") + # key = self._get_key_from_url(url, ".png", append_datetime=True) + # filename = os.path.join(Storage.TMP_FOLDER, key) + + # # Accept cookies popup dismiss for ytdlp video + # if 'facebook.com' in url: + # try: + # logger.debug(f'Trying fb click accept cookie popup for {url}') + # self.driver.get("http://www.facebook.com") + # foo = self.driver.find_element(By.XPATH, "//button[@data-cookiebanner='accept_only_essential_button']") + # foo.click() + # logger.debug(f'fb click worked') + # # linux server needs a sleep otherwise facebook cookie won't have worked and we'll get a popup on next page + # time.sleep(2) + # except: + # logger.warning(f'Failed on fb accept cookies for url {url}') + + # try: + # self.driver.get(url) + # time.sleep(6) + # except TimeoutException: + # logger.info("TimeoutException loading page for screenshot") + + # self.driver.save_screenshot(filename) + # self.storage.upload(filename, key, extra_args={'ACL': 'public-read', 'ContentType': 'image/png'}) + + # cdn_url = self.storage.get_cdn_url(key) + # self.add_to_media(cdn_url, key) + + # return cdn_url diff --git a/src/metadata.py b/src/metadata.py new file mode 100644 index 0000000..39b62ff --- /dev/null +++ b/src/metadata.py @@ -0,0 +1,30 @@ + +from __future__ import annotations +from typing import Union, Dict +from dataclasses import dataclass + + +@dataclass +class Metadata: + # does not handle files, only primitives + # the only piece of logic to handle files is the archiver, enricher, and storage + status: str + # title: str + # url: str + # hash: str + metadata: Dict[str, Metadata] + + @staticmethod + def merge(left: Metadata, right: Metadata, overwrite_left=True) -> Metadata: + # should return a merged version of the Metadata + # will work for archived() and enriched() + # what if 2 metadatas contain the same keys? only one can remain! : overwrite_left + pass + + def get(self, key: str) -> Union[Metadata, str]: + # goes through metadata and returns the Metadata available + pass + + def as_json(self) -> str: + # converts all metadata and data into JSON + pass diff --git a/src/orchestrator.py b/src/orchestrator.py index 30f7b3c..272919f 100644 --- a/src/orchestrator.py +++ b/src/orchestrator.py @@ -1,5 +1,5 @@ -from typing import Union, Dict from __future__ import annotations +from typing import Union, Dict from dataclasses import dataclass """ @@ -39,31 +39,31 @@ Cisticola considerations: 2. So the auto-archiver becomes like a puzzle and fixes to Cisticola scrapers can immediately benefit it, and contributions are focused on a single source or scraping """ -@dataclass -class Metadata: - # does not handle files, only primitives - # the only piece of logic to handle files is the archiver, enricher, and storage - status: str - # title: str - # url: str - # hash: str - main_file: Metadata - metadata: Dict[str, Metadata] +# @dataclass +# class Metadata: +# # does not handle files, only primitives +# # the only piece of logic to handle files is the archiver, enricher, and storage +# status: str +# # title: str +# # url: str +# # hash: str +# main_file: Metadata +# metadata: Dict[str, Metadata] - @staticmethod - def merge(left, right : Metadata, overwrite_left=True) -> Metadata: - # should return a merged version of the Metadata - # will work for archived() and enriched() - # what if 2 metadatas contain the same keys? only one can remain! : overwrite_left - pass +# @staticmethod +# def merge(left, right : Metadata, overwrite_left=True) -> Metadata: +# # should return a merged version of the Metadata +# # will work for archived() and enriched() +# # what if 2 metadatas contain the same keys? only one can remain! : overwrite_left +# pass - def get(self, key) -> Union[Metadata, str]: - # goes through metadata and returns the Metadata available - pass +# def get(self, key) -> Union[Metadata, str]: +# # goes through metadata and returns the Metadata available +# pass - def as_json(self) -> str: - # converts all metadata and data into JSON - pass +# def as_json(self) -> str: +# # converts all metadata and data into JSON +# pass """ @@ -116,27 +116,27 @@ class ArchivingOrchestrator: # where does that update/processing happen? in config.py # reflection for Archiver to know wihch child classes it has? use Archiver.__subclasses__ self.archivers = [ - Archiver.init(a, config.get(a)) + Archiver.init(a, config) for a in config.archivers ] - self.enrichments = [ - Enrichment.init(e, config.get(e)) - for e in config.enrichments + self.enrichers = [ + Enricher.init(e, config) + for e in config.enrichers ] self.formatters = [ - Formatter.init(f, config.get(f)) + Formatter.init(f, config) for f in config.formatters ] self.storages = [ - Storage.init(s, config.get(s)) + Storage.init(s, config) for s in config.storages ] self.databases = [ - Database.init(f, config.get(f)) + Database.init(f, config) for f in config.formatters ] @@ -192,11 +192,11 @@ class ArchivingOrchestrator: # what if an archiver returns multiple entries and one is to be part of HTMLgenerator? # should it call the HTMLgenerator as if it's not an enrichment? - # eg: if it is enable: generates an HTML with all the returned media, should it include enrichments? yes + # eg: if it is enable: generates an HTML with all the returned media, should it include enrichers? yes # then how to execute it last? should there also be post-processors? are there other examples? # maybe as a PDF? or a Markdown file # side captures: screenshot, wacz, webarchive, thumbnails, HTMLgenerator - for e in enrichments: + for e in enrichers: result.update(e.enrich(result)) # formatters, enrichers, and storages will sometimes look for specific properties: eg
  • Screenshot:
  • diff --git a/src/step.py b/src/step.py new file mode 100644 index 0000000..d717386 --- /dev/null +++ b/src/step.py @@ -0,0 +1,30 @@ +from __future__ import annotations +from dataclasses import dataclass +from typing import Type +from metadata import Metadata +from abc import ABC + + +@dataclass +class Step(ABC): + name : str = None + + def __init__(self, config: dict) -> None: + self.config = self.config[self.name] + + @staticmethod + def configs() -> dict: {} + + def init(name: str, config: dict, child: Type[Step]) -> Step: + """ + cannot find subclasses of child.subclasses + """ + for sub in child.__subclasses__(): + if sub.name == name: + return sub.__init__(config) + raise f"Unable to initialize class with {name=}" + + def get_url(self, item: Metadata) -> str: + url = item.get("url") + assert type(url) is str and len(url) > 0 + return url diff --git a/src/utils/__init__.py b/src/utils/__init__.py index 68010ab..baea5e9 100644 --- a/src/utils/__init__.py +++ b/src/utils/__init__.py @@ -1,3 +1,4 @@ # we need to explicitly expose the available imports here from .gworksheet import * -from .misc import * \ No newline at end of file +from .misc import * +from .util import Util \ No newline at end of file diff --git a/src/utils/util.py b/src/utils/util.py new file mode 100644 index 0000000..9ad5b53 --- /dev/null +++ b/src/utils/util.py @@ -0,0 +1,20 @@ +from __future__ import annotations +from dataclasses import dataclass +from abc import abstractmethod, ABC +from metadata import Metadata +from step import Step + +@dataclass +class Util(Step, ABC): + name = "util" + + def __init__(self, config: dict) -> None: + Step.__init__(self) + + + # only for typing... + def init(name: str, config: dict) -> Util: + return super().init(name, config, Util) + + @abstractmethod + def enrich(self, item: Metadata) -> Metadata: pass diff --git a/src/v2.py b/src/v2.py new file mode 100644 index 0000000..8fa544f --- /dev/null +++ b/src/v2.py @@ -0,0 +1,9 @@ + + +from configs.v2config import ConfigV2 +from orchestrator import ArchivingOrchestrator + +config = ConfigV2() +config.parse() + +# orchestrator = ArchivingOrchestrator(config) \ No newline at end of file From 618e7ed0a3a70446ce9e3c99f7f7359c26bf057d Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Thu, 24 Nov 2022 11:53:21 +0000 Subject: [PATCH 022/190] subproperties in config --- src/configs/v2config.py | 76 ++++++++++++++++++++++++++++++----------- 1 file changed, 56 insertions(+), 20 deletions(-) diff --git a/src/configs/v2config.py b/src/configs/v2config.py index 4b072c3..bce5669 100644 --- a/src/configs/v2config.py +++ b/src/configs/v2config.py @@ -3,6 +3,7 @@ import argparse, yaml from dataclasses import dataclass, field from typing import List +from feeders.feeder import Feeder from step import Step from utils import Util from enrichers import Enricher @@ -13,15 +14,16 @@ from collections import defaultdict class ConfigV2: # TODO: should Config inherit from Step so it can have it's own configurations? configurable_parents = [ + Feeder, Enricher, - Util + # Util ] - feeder : Step #TODO:= BaseFeeder - archivers: List[Step] = field(default_factory=[]) #TODO: fix type + feeder: Step # TODO:= BaseFeeder + archivers: List[Step] = field(default_factory=[]) # TODO: fix type enrichers: List[Enricher] = field(default_factory=[]) - formatters: List[Step] = field(default_factory=[]) #TODO: fix type - storages: List[Step] = field(default_factory=[]) #TODO: fix type - databases: List[Step] = field(default_factory=[]) #TODO: fix type + formatters: List[Step] = field(default_factory=[]) # TODO: fix type + storages: List[Step] = field(default_factory=[]) # TODO: fix type + databases: List[Step] = field(default_factory=[]) # TODO: fix type def __init__(self) -> None: self.defaults = {} @@ -39,13 +41,27 @@ class ConfigV2: for configurable in self.configurable_parents: child: Step + # print(f"{configurable=}") for child in configurable.__subclasses__(): + # print(f"{child=} {child.configs()=}") + for config, details in child.configs().items(): + print(config, details) assert "." not in child.name, f"class prop name cannot contain dots('.'): {child.name}" assert "." not in config, f"config property cannot contain dots('.'): {config}" + if (is_nested := type(details["default"]) == dict): + for subconfig, subdefault in details["default"].items(): + assert "." not in subconfig, f"config subproperty cannot contain dots('.'): {subconfig}" + config_path = f"{child.name}.{config}.{subconfig}" + parser.add_argument(f'--{config_path}', action='store', dest=config_path, help=details['help'] + f"({subconfig})") + self.defaults[config_path] = subdefault + config_path = f"{child.name}.{config}" - parser.add_argument(f'--{config_path}', action='store', dest=config_path, help=details['help']) + print(config_path) self.defaults[config_path] = details["default"] + if not is_nested: + # nested cannot be directly set on the CLI + parser.add_argument(f'--{config_path}', action='store', dest=config_path, help=details['help']) args = parser.parse_args() @@ -53,28 +69,48 @@ class ConfigV2: with open(args.config, "r", encoding="utf-8") as inf: self.yaml_config = yaml.safe_load(inf) + # print(f"{self.yaml_config.get('configurations', {})=}") # 3. CONFIGS: decide value with priority: CLI >> config.yaml >> default self.config = defaultdict(dict) for config_path, default in self.defaults.items(): - child, config = tuple(config_path.split(".")) - val = getattr(args, config_path) - if val is None: - val = self.yaml_config.get("configurations", {}).get(child, {}).get(config, default) - self.config[child][config] = val + config_steps = config_path.split(".") + if len(config_steps) == 2: # not nested + child, config = tuple(config_steps) + val = getattr(args, config_path, None) + if val is None: + val = self.yaml_config.get("configurations", {}).get(child, {}).get(config, default) + # self.config[child][config] = val + + elif len(config_steps) == 3: # nested + child, config, subconfig = tuple(config_steps) + val = getattr(args, config_path) + if config not in self.config[child]: + self.config[child][config] = {} + if val is None: + val = self.yaml_config.get("configurations", {}).get(child, {}).get(config, {}).get(subconfig, default) + print(child, config, subconfig, val) + self.config[child][config][subconfig] = val + + # child, config = tuple(config_path.split(".")) + # # print(config_path) + # val = getattr(args, config_path) + # # print(child, config, val) + # if val is None: + # val = self.yaml_config.get("configurations", {}).get(child, {}).get(config, default) + # self.config[child][config] = val self.config = dict(self.config) # 4. STEPS: read steps and validate they exist steps = self.yaml_config.get("steps", {}) assert "archivers" in steps, "your configuration steps are missing the archivers property" assert "storages" in steps, "your configuration steps are missing the storages property" - - print(self.config) - - # self.feeder = Feeder.init - self.enrichers = [Enricher.init(steps.get("enrichers", [])[0], self.config)] - - - print(self.enrichers) + + print("config.py", self.config) + + self.feeder = Feeder.init(steps.get("feeder", "cli_feeder"), self.config) + self.enrichers = [Enricher.init(e, self.config) for e in steps.get("enrichers", [])] + + print("enrichers", [e for e in self.enrichers]) def validate(self): pass From 9dc709d3b90603c205b081847362f19027d123f4 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Thu, 24 Nov 2022 15:44:25 +0000 Subject: [PATCH 023/190] demo feeder logic working --- orchestrate.yaml | 34 ++++++++-- src/archivers/telethon_archiver.py | 2 +- src/configs/v2config.py | 56 +++++----------- src/enrichers/enricher.py | 3 +- src/feeders/__init__.py | 2 + src/feeders/feeder.py | 23 +++++++ src/feeders/feeder_gsheet.py | 101 +++++++++++++++++++++++++++++ src/orchestrator.py | 54 +++++++-------- src/step.py | 16 +++-- src/utils/__init__.py | 2 +- src/utils/util.py | 3 +- src/v2.py | 5 +- 12 files changed, 216 insertions(+), 85 deletions(-) create mode 100644 src/feeders/__init__.py create mode 100644 src/feeders/feeder.py create mode 100644 src/feeders/feeder_gsheet.py diff --git a/orchestrate.yaml b/orchestrate.yaml index 689765f..3a2bc27 100644 --- a/orchestrate.yaml +++ b/orchestrate.yaml @@ -10,9 +10,9 @@ steps: - webarchive # this way it runs as a failsafe only enrichers: - screenshot - - wacz - - webarchive # this way it runs for every case, webarchive extends archiver and enrichment - - thumbnails + # - wacz + # - webarchive # this way it runs for every case, webarchive extends archiver and enrichment + # - thumbnails formatters: - HTMLFormater - PdfFormater @@ -29,10 +29,32 @@ configurations: global: - save_logs: False gsheets_feeder: - - sheet: "Auto archiver" - - header: "" # defaults to 1 in GSheetsFeeder - - service_account: "secrets/service_account.json" + sheet: auto-archiver-test + header: 2 # defaults to 1 in GSheetsFeeder + service_account: "secrets/service_account.json" + allow_worksheets: "aa-refactor-tests" + block_worksheets: "blocked,test-cases-008" + columns: + 'url': 'link' + 'status': 'archive status' + 'folder': 'destination folder' + 'archive': 'archive location' + 'date': 'archive date' + 'thumbnail': 'thumbnail' + 'thumbnail_index': 'thumbnail index' + 'timestamp': 'upload timestamp' + 'title': 'upload title' + 'duration': 'duration' + 'screenshot': 'screenshot' + 'hash': 'hash' + 'wacz': 'wacz' + 'replaywebpage': 'replaywebpage' tiktok: + api_keys: + - username: 1 + password: 2 + - username: 3 + password: 4 username: "abc" password: "123" token: "here" diff --git a/src/archivers/telethon_archiver.py b/src/archivers/telethon_archiver.py index f0ff194..a2cbf0a 100644 --- a/src/archivers/telethon_archiver.py +++ b/src/archivers/telethon_archiver.py @@ -17,7 +17,7 @@ class TelethonArchiver(Archiver): super().__init__(storage, config) if config.telegram_config: c = config.telegram_config - self.client = TelegramClient("./anon", c.api_id, c.api_hash) + self.client = TelegramClient("./anon.session", c.api_id, c.api_hash) self.bot_token = c.bot_token def _get_media_posts_in_group(self, chat, original_post, max_amp=10): diff --git a/src/configs/v2config.py b/src/configs/v2config.py index bce5669..9eb35df 100644 --- a/src/configs/v2config.py +++ b/src/configs/v2config.py @@ -27,8 +27,10 @@ class ConfigV2: def __init__(self) -> None: self.defaults = {} + self.cli_ops = {} self.config = {} + # TODO: make this work for nested props like gsheets_feeder.columns.url = "URL" def parse(self): # 1. parse CLI values parser = argparse.ArgumentParser( @@ -41,27 +43,15 @@ class ConfigV2: for configurable in self.configurable_parents: child: Step - # print(f"{configurable=}") for child in configurable.__subclasses__(): - # print(f"{child=} {child.configs()=}") - for config, details in child.configs().items(): - print(config, details) assert "." not in child.name, f"class prop name cannot contain dots('.'): {child.name}" assert "." not in config, f"config property cannot contain dots('.'): {config}" - if (is_nested := type(details["default"]) == dict): - for subconfig, subdefault in details["default"].items(): - assert "." not in subconfig, f"config subproperty cannot contain dots('.'): {subconfig}" - config_path = f"{child.name}.{config}.{subconfig}" - parser.add_argument(f'--{config_path}', action='store', dest=config_path, help=details['help'] + f"({subconfig})") - self.defaults[config_path] = subdefault - config_path = f"{child.name}.{config}" - print(config_path) + parser.add_argument(f'--{config_path}', action='store', dest=config_path, help=details['help']) self.defaults[config_path] = details["default"] - if not is_nested: - # nested cannot be directly set on the CLI - parser.add_argument(f'--{config_path}', action='store', dest=config_path, help=details['help']) + if "cli_set" in details: + self.cli_ops[config_path] = details["cli_set"] args = parser.parse_args() @@ -73,31 +63,14 @@ class ConfigV2: # 3. CONFIGS: decide value with priority: CLI >> config.yaml >> default self.config = defaultdict(dict) for config_path, default in self.defaults.items(): - config_steps = config_path.split(".") - if len(config_steps) == 2: # not nested - child, config = tuple(config_steps) - val = getattr(args, config_path, None) - if val is None: - val = self.yaml_config.get("configurations", {}).get(child, {}).get(config, default) - # self.config[child][config] = val - - elif len(config_steps) == 3: # nested - child, config, subconfig = tuple(config_steps) - val = getattr(args, config_path) - if config not in self.config[child]: - self.config[child][config] = {} - if val is None: - val = self.yaml_config.get("configurations", {}).get(child, {}).get(config, {}).get(subconfig, default) - print(child, config, subconfig, val) - self.config[child][config][subconfig] = val - - # child, config = tuple(config_path.split(".")) - # # print(config_path) - # val = getattr(args, config_path) - # # print(child, config, val) - # if val is None: - # val = self.yaml_config.get("configurations", {}).get(child, {}).get(config, default) - # self.config[child][config] = val + child, config = tuple(config_path.split(".")) + val = getattr(args, config_path) + if val is not None and config_path in self.cli_ops: + val = self.cli_ops[config_path](val, default) + if val is None: + val = self.yaml_config.get("configurations", {}).get(child, {}).get(config, default) + # print(child, config, val) + self.config[child][config] = val self.config = dict(self.config) # 4. STEPS: read steps and validate they exist @@ -105,11 +78,12 @@ class ConfigV2: assert "archivers" in steps, "your configuration steps are missing the archivers property" assert "storages" in steps, "your configuration steps are missing the storages property" - print("config.py", self.config) + # print("config.py", self.config) self.feeder = Feeder.init(steps.get("feeder", "cli_feeder"), self.config) self.enrichers = [Enricher.init(e, self.config) for e in steps.get("enrichers", [])] + print("feeder", self.feeder) print("enrichers", [e for e in self.enrichers]) def validate(self): diff --git a/src/enrichers/enricher.py b/src/enrichers/enricher.py index c767b8e..baa22e3 100644 --- a/src/enrichers/enricher.py +++ b/src/enrichers/enricher.py @@ -9,7 +9,8 @@ class Enricher(Step, ABC): name = "enricher" def __init__(self, config: dict) -> None: - Step.__init__(self) + # without this STEP.__init__ is not called + super().__init__(config) # only for typing... diff --git a/src/feeders/__init__.py b/src/feeders/__init__.py new file mode 100644 index 0000000..9fb5942 --- /dev/null +++ b/src/feeders/__init__.py @@ -0,0 +1,2 @@ +from.feeder import Feeder +from .feeder_gsheet import GsheetsFeeder \ No newline at end of file diff --git a/src/feeders/feeder.py b/src/feeders/feeder.py new file mode 100644 index 0000000..6b7ba10 --- /dev/null +++ b/src/feeders/feeder.py @@ -0,0 +1,23 @@ +from __future__ import annotations +from dataclasses import dataclass +from abc import abstractmethod +# from metadata import Metadata +from step import Step + + +@dataclass +class Feeder(Step): + name = "feeder" + + def __init__(self, config: dict) -> None: + # without this STEP.__init__ is not called + super().__init__(config) + + def init(name: str, config: dict) -> Feeder: + # only for code typing + return Step.init(name, config, Feeder) + + # def feed(self, item: Metadata) -> Metadata: pass + + @abstractmethod + def __iter__(self) -> Feeder: return None \ No newline at end of file diff --git a/src/feeders/feeder_gsheet.py b/src/feeders/feeder_gsheet.py new file mode 100644 index 0000000..7ebc640 --- /dev/null +++ b/src/feeders/feeder_gsheet.py @@ -0,0 +1,101 @@ +import json, gspread + +# from metadata import Metadata +from loguru import logger + +# from . import Enricher +from feeders.feeder import Feeder +from utils import GWorksheet + + +class GsheetsFeeder(Feeder): + name = "gsheets_feeder" + + def __init__(self, config: dict) -> None: + # without this STEP.__init__ is not called + super().__init__(config) + self.gsheets_client = gspread.service_account(filename=self.service_account) + assert type(self.header) == int, f"header ({self.header}) value must be an integer not {type(self.header)}" + + @staticmethod + def configs() -> dict: + return { + "sheet": {"default": None, "help": "name of the sheet to archive"}, + "header": {"default": 1, "help": "index of the header row (starts at 1)"}, + "service_account": {"default": "secrets/service_account.json", "help": "service account JSON file path"}, + "allow_worksheets": { + "default": set(), + "help": "(CSV) only worksheets whose name is included in allow are included (overrides worksheet_block), leave empty so all are allowed", + "cli_set": lambda cli_val, cur_val: set(cli_val.split(",")) + }, + "block_worksheets": { + "default": set(), + "help": "(CSV) explicitly block some worksheets from being processed, defaults to empty", + "cli_set": lambda cli_val, cur_val: set(cli_val.split(",")) + }, + "columns": { + "default": { + 'url': 'link', + 'status': 'archive status', + 'folder': 'destination folder', + 'archive': 'archive location', + 'date': 'archive date', + 'thumbnail': 'thumbnail', + 'thumbnail_index': 'thumbnail index', + 'timestamp': 'upload timestamp', + 'title': 'upload title', + 'duration': 'duration', + 'screenshot': 'screenshot', + 'hash': 'hash', + 'wacz': 'wacz', + 'replaywebpage': 'replaywebpage', + }, + "help": "names of columns in the google sheet", + "cli_set": lambda cli_val, cur_val: dict(cur_val, **json.loads(cli_val)) + }, + } + def __iter__(self) -> str: + sh = self.gsheets_client.open(self.sheet) + for ii, wks in enumerate(sh.worksheets()): + if not self.should_process_sheet(wks.title): + logger.debug(f"SKIPPED worksheet '{wks.title}' due to allow/block rules") + continue + + logger.info(f'Opening worksheet {ii=}: {wks.title=} header={self.header}') + gw = GWorksheet(wks, header_row=self.header, columns=self.columns) + + if len(missing_cols := self.missing_required_columns(gw)): + logger.warning(f"SKIPPED worksheet '{wks.title}' due to missing required column(s) for {missing_cols}") + continue + + for row in range(1 + self.header, gw.count_rows() + 1): + url = gw.get_cell(row, 'url').strip() + if not len(url): continue + #TODO: gsheet_db should check later if this is supposed to be archived + # static_status = gw.get_cell(row, 'status') + # status = gw.get_cell(row, 'status', fresh=static_status in ['', None] and url != '') + # All checks done - archival process starts here + yield url + logger.success(f'Finished worksheet {wks.title}') + + # GWorksheet(self.sheet) + print(self.sheet) + for u in ["url1", "url2"]: + yield u + + + def should_process_sheet(self, sheet_name: str) -> bool: + if len(self.allow_worksheets) and sheet_name not in self.allow_worksheets: + # ALLOW rules exist AND sheet name not explicitly allowed + return False + if len(self.block_worksheets) and sheet_name in self.block_worksheets: + # BLOCK rules exist AND sheet name is blocked + return False + return True + + def missing_required_columns(self, gw: GWorksheet) -> list: + missing = [] + for required_col in ['url', 'status']: + if not gw.col_exists(required_col): + missing.append(required_col) + return missing diff --git a/src/orchestrator.py b/src/orchestrator.py index 272919f..f32f4c9 100644 --- a/src/orchestrator.py +++ b/src/orchestrator.py @@ -2,6 +2,8 @@ from __future__ import annotations from typing import Union, Dict from dataclasses import dataclass +from enrichers.enricher import Enricher + """ how not to couple the different pieces of logic due to the use of constants for the metadata keys? @@ -110,49 +112,47 @@ class ArchivingOrchestrator: def __init__(self, config) -> None: # in config.py we should test that the archivers exist and log mismatches (blocking execution) # identify each formatter, storage, database, etc - self.feeder = Feeder.init(config.feeder, config.get(config.feeder)) + # self.feeder = Feeder.init(config.feeder, config.get(config.feeder)) # Is it possible to overwrite config.yaml values? it could be useful: share config file and modify gsheets_feeder.sheet via CLI # where does that update/processing happen? in config.py # reflection for Archiver to know wihch child classes it has? use Archiver.__subclasses__ - self.archivers = [ - Archiver.init(a, config) - for a in config.archivers - ] + # self.archivers = [ + # Archiver.init(a, config) + # for a in config.archivers + # ] + self.feeder = config.feeder + self.enrichers = config.enrichers - self.enrichers = [ - Enricher.init(e, config) - for e in config.enrichers - ] + # self.formatters = [ + # Formatter.init(f, config) + # for f in config.formatters + # ] - self.formatters = [ - Formatter.init(f, config) - for f in config.formatters - ] + # self.storages = [ + # Storage.init(s, config) + # for s in config.storages + # ] - self.storages = [ - Storage.init(s, config) - for s in config.storages - ] - - self.databases = [ - Database.init(f, config) - for f in config.formatters - ] + # self.databases = [ + # Database.init(f, config) + # for f in config.formatters + # ] # these rules are checked in config.py - assert len(archivers) > 1, "there needs to be at least one Archiver" + # assert len(archivers) > 1, "there needs to be at least one Archiver" - def feed(self, feeder: Feeder) -> list(ArchiveResult): - for next in feeder: - self.archive(next) + def feed(self) -> list(ArchiveResult): + for url in self.feeder: + print("ARCHIVING", url) + # self.archive(url) # how does this handle the parameters like folder which can be different for each archiver? # the storage needs to know where to archive!! # solution: feeders have context: extra metadata that they can read or ignore, # all of it should have sensible defaults (eg: folder) # default feeder is a list with 1 element - def archive(url) -> Union[ArchiveResult, None]: + def archive(self, url) -> Union[ArchiveResult, None]: url = clear_url(url) result = Metadata(url=url) diff --git a/src/step.py b/src/step.py index d717386..04d7a61 100644 --- a/src/step.py +++ b/src/step.py @@ -1,16 +1,21 @@ from __future__ import annotations -from dataclasses import dataclass +from dataclasses import dataclass, field +from inspect import ClassFoundException from typing import Type from metadata import Metadata from abc import ABC +# from collections.abc import Iterable @dataclass class Step(ABC): - name : str = None + name: str = None def __init__(self, config: dict) -> None: - self.config = self.config[self.name] + # reads the configs into object properties + # self.config = config[self.name] + for k, v in config[self.name].items(): + self.__setattr__(k, v) @staticmethod def configs() -> dict: {} @@ -21,8 +26,9 @@ class Step(ABC): """ for sub in child.__subclasses__(): if sub.name == name: - return sub.__init__(config) - raise f"Unable to initialize class with {name=}" + print(sub.name, "CALLING NEW") + return sub(config) + raise ClassFoundException(f"Unable to initialize STEP with {name=}") def get_url(self, item: Metadata) -> str: url = item.get("url") diff --git a/src/utils/__init__.py b/src/utils/__init__.py index baea5e9..ad56f36 100644 --- a/src/utils/__init__.py +++ b/src/utils/__init__.py @@ -1,4 +1,4 @@ # we need to explicitly expose the available imports here -from .gworksheet import * +from .gworksheet import GWorksheet from .misc import * from .util import Util \ No newline at end of file diff --git a/src/utils/util.py b/src/utils/util.py index 9ad5b53..51bb2e3 100644 --- a/src/utils/util.py +++ b/src/utils/util.py @@ -11,8 +11,7 @@ class Util(Step, ABC): def __init__(self, config: dict) -> None: Step.__init__(self) - - # only for typing... + # only for typing... def init(name: str, config: dict) -> Util: return super().init(name, config, Util) diff --git a/src/v2.py b/src/v2.py index 8fa544f..8ecb820 100644 --- a/src/v2.py +++ b/src/v2.py @@ -1,9 +1,12 @@ +from abc import ABC from configs.v2config import ConfigV2 from orchestrator import ArchivingOrchestrator config = ConfigV2() config.parse() -# orchestrator = ArchivingOrchestrator(config) \ No newline at end of file +orchestrator = ArchivingOrchestrator(config) + +orchestrator.feed() From 955891a411cb2bd96a477f3751472776995b101a Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Sat, 10 Dec 2022 12:03:46 +0000 Subject: [PATCH 024/190] WIP feeder --- orchestrate.yaml | 8 +++--- src/configs/v2config.py | 2 +- src/databases/database.py | 21 ++++++++++++++ src/enrichers/enricher.py | 2 +- src/feeders/feeder.py | 2 +- src/feeders/feeder_gsheet.py | 54 ++++++++++++------------------------ src/steps/gsheet.py | 42 ++++++++++++++++++++++++++++ src/{ => steps}/step.py | 0 src/utils/util.py | 2 +- 9 files changed, 88 insertions(+), 45 deletions(-) create mode 100644 src/databases/database.py create mode 100644 src/steps/gsheet.py rename src/{ => steps}/step.py (100%) diff --git a/orchestrate.yaml b/orchestrate.yaml index 3a2bc27..9626e83 100644 --- a/orchestrate.yaml +++ b/orchestrate.yaml @@ -3,11 +3,11 @@ steps: # a feeder could be in an "infinite loop" for example: gsheets_infinite feeder which holds-> this could be an easy logic addiction by modifying for each to while not feeder.done() if it becomes necessary feeder: gsheets_feeder # default -> only expects URL from CLI archivers: # order matters - - tiktok - telethon - - twitter - - instagram - - webarchive # this way it runs as a failsafe only + # - tiktok + # - twitter + # - instagram + # - webarchive # this way it runs as a failsafe only enrichers: - screenshot # - wacz diff --git a/src/configs/v2config.py b/src/configs/v2config.py index 9eb35df..50c8b0f 100644 --- a/src/configs/v2config.py +++ b/src/configs/v2config.py @@ -4,7 +4,7 @@ import argparse, yaml from dataclasses import dataclass, field from typing import List from feeders.feeder import Feeder -from step import Step +from steps.step import Step from utils import Util from enrichers import Enricher from collections import defaultdict diff --git a/src/databases/database.py b/src/databases/database.py new file mode 100644 index 0000000..15f8d0d --- /dev/null +++ b/src/databases/database.py @@ -0,0 +1,21 @@ +from __future__ import annotations +from dataclasses import dataclass +from abc import abstractmethod, ABC +from metadata import Metadata +from steps.step import Step + +@dataclass +class Database(Step, ABC): + name = "database" + + def __init__(self, config: dict) -> None: + # without this STEP.__init__ is not called + super().__init__(config) + + + # only for typing... + def init(name: str, config: dict) -> Database: + return Step.init(name, config, Database) + + @abstractmethod + def enrich(self, item: Metadata) -> Metadata: pass diff --git a/src/enrichers/enricher.py b/src/enrichers/enricher.py index baa22e3..faf43d8 100644 --- a/src/enrichers/enricher.py +++ b/src/enrichers/enricher.py @@ -2,7 +2,7 @@ from __future__ import annotations from dataclasses import dataclass from abc import abstractmethod, ABC from metadata import Metadata -from step import Step +from steps.step import Step @dataclass class Enricher(Step, ABC): diff --git a/src/feeders/feeder.py b/src/feeders/feeder.py index 6b7ba10..d930ba0 100644 --- a/src/feeders/feeder.py +++ b/src/feeders/feeder.py @@ -2,7 +2,7 @@ from __future__ import annotations from dataclasses import dataclass from abc import abstractmethod # from metadata import Metadata -from step import Step +from steps.step import Step @dataclass diff --git a/src/feeders/feeder_gsheet.py b/src/feeders/feeder_gsheet.py index 7ebc640..a99376f 100644 --- a/src/feeders/feeder_gsheet.py +++ b/src/feeders/feeder_gsheet.py @@ -5,10 +5,11 @@ from loguru import logger # from . import Enricher from feeders.feeder import Feeder +from steps.gsheet import Gsheets from utils import GWorksheet -class GsheetsFeeder(Feeder): +class GsheetsFeeder(Gsheets, Feeder): name = "gsheets_feeder" def __init__(self, config: dict) -> None: @@ -19,41 +20,21 @@ class GsheetsFeeder(Feeder): @staticmethod def configs() -> dict: - return { - "sheet": {"default": None, "help": "name of the sheet to archive"}, - "header": {"default": 1, "help": "index of the header row (starts at 1)"}, - "service_account": {"default": "secrets/service_account.json", "help": "service account JSON file path"}, - "allow_worksheets": { - "default": set(), - "help": "(CSV) only worksheets whose name is included in allow are included (overrides worksheet_block), leave empty so all are allowed", - "cli_set": lambda cli_val, cur_val: set(cli_val.split(",")) - }, - "block_worksheets": { - "default": set(), - "help": "(CSV) explicitly block some worksheets from being processed, defaults to empty", - "cli_set": lambda cli_val, cur_val: set(cli_val.split(",")) - }, - "columns": { - "default": { - 'url': 'link', - 'status': 'archive status', - 'folder': 'destination folder', - 'archive': 'archive location', - 'date': 'archive date', - 'thumbnail': 'thumbnail', - 'thumbnail_index': 'thumbnail index', - 'timestamp': 'upload timestamp', - 'title': 'upload title', - 'duration': 'duration', - 'screenshot': 'screenshot', - 'hash': 'hash', - 'wacz': 'wacz', - 'replaywebpage': 'replaywebpage', + return dict( + Gsheets.configs(), + ** { + "allow_worksheets": { + "default": set(), + "help": "(CSV) only worksheets whose name is included in allow are included (overrides worksheet_block), leave empty so all are allowed", + "cli_set": lambda cli_val, cur_val: set(cli_val.split(",")) }, - "help": "names of columns in the google sheet", - "cli_set": lambda cli_val, cur_val: dict(cur_val, **json.loads(cli_val)) - }, - } + "block_worksheets": { + "default": set(), + "help": "(CSV) explicitly block some worksheets from being processed, defaults to empty", + "cli_set": lambda cli_val, cur_val: set(cli_val.split(",")) + } + }) + def __iter__(self) -> str: sh = self.gsheets_client.open(self.sheet) for ii, wks in enumerate(sh.worksheets()): @@ -71,7 +52,7 @@ class GsheetsFeeder(Feeder): for row in range(1 + self.header, gw.count_rows() + 1): url = gw.get_cell(row, 'url').strip() if not len(url): continue - #TODO: gsheet_db should check later if this is supposed to be archived + # TODO: gsheet_db should check later if this is supposed to be archived # static_status = gw.get_cell(row, 'status') # status = gw.get_cell(row, 'status', fresh=static_status in ['', None] and url != '') # All checks done - archival process starts here @@ -83,7 +64,6 @@ class GsheetsFeeder(Feeder): for u in ["url1", "url2"]: yield u - def should_process_sheet(self, sheet_name: str) -> bool: if len(self.allow_worksheets) and sheet_name not in self.allow_worksheets: # ALLOW rules exist AND sheet name not explicitly allowed diff --git a/src/steps/gsheet.py b/src/steps/gsheet.py new file mode 100644 index 0000000..9654da4 --- /dev/null +++ b/src/steps/gsheet.py @@ -0,0 +1,42 @@ +import json, gspread + +from loguru import logger +from steps.step import Step + + +class Gsheets(Step): + name = "gsheets" + + def __init__(self, config: dict) -> None: + # without this STEP.__init__ is not called + super().__init__(config) + self.gsheets_client = gspread.service_account(filename=self.service_account) + assert type(self.header) == int, f"header ({self.header}) value must be an integer not {type(self.header)}" + + @staticmethod + def configs() -> dict: + return { + "sheet": {"default": None, "help": "name of the sheet to archive"}, + "header": {"default": 1, "help": "index of the header row (starts at 1)"}, + "service_account": {"default": "secrets/service_account.json", "help": "service account JSON file path"}, + "columns": { + "default": { + 'url': 'link', + 'status': 'archive status', + 'folder': 'destination folder', + 'archive': 'archive location', + 'date': 'archive date', + 'thumbnail': 'thumbnail', + 'thumbnail_index': 'thumbnail index', + 'timestamp': 'upload timestamp', + 'title': 'upload title', + 'duration': 'duration', + 'screenshot': 'screenshot', + 'hash': 'hash', + 'wacz': 'wacz', + 'replaywebpage': 'replaywebpage', + }, + "help": "names of columns in the google sheet", + "cli_set": lambda cli_val, cur_val: dict(cur_val, **json.loads(cli_val)) + }, + } \ No newline at end of file diff --git a/src/step.py b/src/steps/step.py similarity index 100% rename from src/step.py rename to src/steps/step.py diff --git a/src/utils/util.py b/src/utils/util.py index 51bb2e3..714d499 100644 --- a/src/utils/util.py +++ b/src/utils/util.py @@ -2,7 +2,7 @@ from __future__ import annotations from dataclasses import dataclass from abc import abstractmethod, ABC from metadata import Metadata -from step import Step +from steps.step import Step @dataclass class Util(Step, ABC): From b3860cfec10f5f9924d146ca1c819d9f2e9aef3f Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Wed, 14 Dec 2022 14:01:39 +0000 Subject: [PATCH 025/190] telethon join channels working --- .gitignore | 3 +- Pipfile | 1 + Pipfile.lock | 457 ++++++++++-------- README.md | 6 + ...estrate.yaml => orchestration.example.yaml | 22 +- src/archivers/__init__.py | 5 +- src/archivers/archiver.py | 26 + src/archivers/telethon_archiverv2.py | 114 +++++ src/configs/v2config.py | 11 +- src/enrichers/enricher_screenshot.py | 49 +- src/feeders/feeder_gsheet.py | 4 +- src/metadata.py | 22 +- src/orchestrator.py | 59 ++- src/steps/gsheet.py | 2 +- src/utils/__init__.py | 3 +- src/utils/util.py | 5 +- src/utils/webdriver.py | 45 ++ 17 files changed, 539 insertions(+), 295 deletions(-) rename orchestrate.yaml => orchestration.example.yaml (80%) create mode 100644 src/archivers/archiver.py create mode 100644 src/archivers/telethon_archiverv2.py create mode 100644 src/utils/webdriver.py diff --git a/.gitignore b/.gitignore index 59ed096..88ccd0e 100644 --- a/.gitignore +++ b/.gitignore @@ -23,4 +23,5 @@ secrets/* browsertrix/* browsertrix-tmp/* instaloader/* -instaloader.session \ No newline at end of file +instaloader.session +orchestration.yaml \ No newline at end of file diff --git a/Pipfile b/Pipfile index aa04ea4..2095f2b 100644 --- a/Pipfile +++ b/Pipfile @@ -26,6 +26,7 @@ dateparser = "*" vk-url-scraper = "*" python-twitter-v2 = "*" instaloader = "*" +tqdm = "*" [requires] python_version = "3.9" diff --git a/Pipfile.lock b/Pipfile.lock index 6aac097..5bfeba7 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -1,7 +1,7 @@ { "_meta": { "hash": { - "sha256": "bd987e7237c7e32d2dffb295db633f5a022ce1a718435d11d8ac303c9e37a4d3" + "sha256": "60b8f39d7a466e194c98a3fb6a03f74f03b108f5fac4cce8657c5ffdf6a02962" }, "pipfile-spec": 6, "requires": { @@ -57,22 +57,24 @@ }, "boto3": { "hashes": [ - "sha256:3b0fa19390895e664045713f2e47e63ad29c9f98b7bee6836dec7124953e48b8", - "sha256:9feb98e045736f943c2099d955415cfe44133e03d8e2d7581d2e5dc74d0ed064" + "sha256:53badfc5f145b8a3f9117512b41bc5a64db1cce1b549061d8edba68909e63fdf", + "sha256:548081a0f8854bb2eea1e368ab29945478105f56989546f653c75528dcb07d88" ], "index": "pypi", - "version": "==1.26.1" + "version": "==1.26.28" }, "botocore": { "hashes": [ - "sha256:75c65130ffab527d0a3d948c6d87eb8eac210e079e1ff2768c66484be57bb77c", - "sha256:e38b7cdce927cefabe45608dde61660b76458fba6624240dcdb6c4b8453d17f7" + "sha256:982732e7ed65cb6ed11ea3ce0e32dff2bcd465836c32376154f0802aa0a112c7", + "sha256:f0b8bb976e368dea20a960b47169e31fc0828feb6f0b9f59f1e5be8d08919b10" ], "markers": "python_version >= '3.7'", - "version": "==1.29.1" + "version": "==1.29.28" }, "brotli": { "hashes": [ + "sha256:02177603aaca36e1fd21b091cb742bb3b305a569e2402f1ca38af471777fb019", + "sha256:11d3283d89af7033236fa4e73ec2cbe743d4f6a81d41bd234f24bf63dde979df", "sha256:12effe280b8ebfd389022aa65114e30407540ccb89b177d3fbc9a4f177c4bd5d", "sha256:160c78292e98d21e73a4cc7f76a234390e516afcd982fa17e1422f7c6a9ce9c8", "sha256:16d528a45c2e1909c2798f27f7bf0a3feec1dc9e50948e738b961618e38b6a7b", @@ -83,9 +85,15 @@ "sha256:26d168aac4aaec9a4394221240e8a5436b5634adc3cd1cdf637f6645cecbf181", "sha256:29d1d350178e5225397e28ea1b7aca3648fcbab546d20e7475805437bfb0a130", "sha256:2aad0e0baa04517741c9bb5b07586c642302e5fb3e75319cb62087bd0995ab19", + "sha256:3148362937217b7072cf80a2dcc007f09bb5ecb96dae4617316638194113d5be", + "sha256:330e3f10cd01da535c70d09c4283ba2df5fb78e915bea0a28becad6e2ac010be", + "sha256:336b40348269f9b91268378de5ff44dc6fbaa2268194f85177b53463d313842a", "sha256:3496fc835370da351d37cada4cf744039616a6db7d13c430035e901443a34daa", "sha256:35a3edbe18e876e596553c4007a087f8bcfd538f19bc116917b3c7522fca0429", "sha256:3b78a24b5fd13c03ee2b7b86290ed20efdc95da75a3557cc06811764d5ad1126", + "sha256:3b8b09a16a1950b9ef495a0f8b9d0a87599a9d1f179e2d4ac014b2ec831f87e7", + "sha256:3c1306004d49b84bd0c4f90457c6f57ad109f5cc6067a9664e12b7b79a9948ad", + "sha256:3ffaadcaeafe9d30a7e4e1e97ad727e4f5610b9fa2f7551998471e3736738679", "sha256:40d15c79f42e0a2c72892bf407979febd9cf91f36f495ffb333d1d04cebb34e4", "sha256:44bb8ff420c1d19d91d79d8c3574b8954288bdff0273bf788954064d260d7ab0", "sha256:4688c1e42968ba52e57d8670ad2306fe92e0169c6f3af0089be75bbac0c64a3b", @@ -95,6 +103,7 @@ "sha256:56d027eace784738457437df7331965473f2c0da2c70e1a1f6fdbae5402e0389", "sha256:5913a1177fc36e30fcf6dc868ce23b0453952c78c04c266d3149b3d39e1410d6", "sha256:5b6ef7d9f9c38292df3690fe3e302b5b530999fa90014853dcd0d6902fb59f26", + "sha256:5bf37a08493232fbb0f8229f1824b366c2fc1d02d64e7e918af40acd15f3e337", "sha256:5cb1e18167792d7d21e21365d7650b72d5081ed476123ff7b8cac7f45189c0c7", "sha256:61a7ee1f13ab913897dac7da44a73c6d44d48a4adff42a5701e3239791c96e14", "sha256:622a231b08899c864eb87e85f81c75e7b9ce05b001e59bbfbf43d4a71f5f32b2", @@ -102,6 +111,7 @@ "sha256:6b2ae9f5f67f89aade1fab0f7fd8f2832501311c363a21579d02defa844d9296", "sha256:6c772d6c0a79ac0f414a9f8947cc407e119b8598de7621f39cacadae3cf57d12", "sha256:6d847b14f7ea89f6ad3c9e3901d1bc4835f6b390a9c71df999b0162d9bb1e20f", + "sha256:73fd30d4ce0ea48010564ccee1a26bfe39323fde05cb34b5863455629db61dc7", "sha256:76ffebb907bec09ff511bb3acc077695e2c32bc2142819491579a695f77ffd4d", "sha256:7bbff90b63328013e1e8cb50650ae0b9bac54ffb4be6104378490193cd60f85a", "sha256:7cb81373984cc0e4682f31bc3d6be9026006d96eecd07ea49aafb06897746452", @@ -111,6 +121,7 @@ "sha256:87fdccbb6bb589095f413b1e05734ba492c962b4a45a13ff3408fa44ffe6479b", "sha256:88c63a1b55f352b02c6ffd24b15ead9fc0e8bf781dbe070213039324922a2eea", "sha256:8a674ac10e0a87b683f4fa2b6fa41090edfd686a6524bd8dedbd6138b309175c", + "sha256:8ed6a5b3d23ecc00ea02e1ed8e0ff9a08f4fc87a1f58a2530e71c0f48adf882f", "sha256:93130612b837103e15ac3f9cbacb4613f9e348b58b3aad53721d92e57f96d46a", "sha256:9744a863b489c79a73aba014df554b0e7a0fc44ef3f8a0ef2a52919c7d155031", "sha256:9749a124280a0ada4187a6cfd1ffd35c350fb3af79c706589d98e088c5044267", @@ -120,15 +131,24 @@ "sha256:9ed4c92a0665002ff8ea852353aeb60d9141eb04109e88928026d3c8a9e5433c", "sha256:a72661af47119a80d82fa583b554095308d6a4c356b2a554fdc2799bc19f2a43", "sha256:afde17ae04d90fbe53afb628f7f2d4ca022797aa093e809de5c3cf276f61bbfa", + "sha256:b1375b5d17d6145c798661b67e4ae9d5496920d9265e2f00f1c2c0b5ae91fbde", "sha256:b336c5e9cf03c7be40c47b5fd694c43c9f1358a80ba384a21969e0b4e66a9b17", + "sha256:b3523f51818e8f16599613edddb1ff924eeb4b53ab7e7197f85cbc321cdca32f", + "sha256:b43775532a5904bc938f9c15b77c613cb6ad6fb30990f3b0afaea82797a402d8", "sha256:b663f1e02de5d0573610756398e44c130add0eb9a3fc912a09665332942a2efb", "sha256:b83bb06a0192cccf1eb8d0a28672a1b79c74c3a8a5f2619625aeb6f28b3a82bb", + "sha256:ba72d37e2a924717990f4d7482e8ac88e2ef43fb95491eb6e0d124d77d2a150d", "sha256:c2415d9d082152460f2bd4e382a1e85aed233abc92db5a3880da2257dc7daf7b", "sha256:c83aa123d56f2e060644427a882a36b3c12db93727ad7a7b9efd7d7f3e9cc2c4", + "sha256:c8e521a0ce7cf690ca84b8cc2272ddaf9d8a50294fd086da67e517439614c755", + "sha256:cab1b5964b39607a66adbba01f1c12df2e55ac36c81ec6ed44f2fca44178bf1a", + "sha256:cb02ed34557afde2d2da68194d12f5719ee96cfb2eacc886352cb73e3808fc5d", + "sha256:cc0283a406774f465fb45ec7efb66857c09ffefbe49ec20b7882eff6d3c86d3a", "sha256:cfc391f4429ee0a9370aa93d812a52e1fee0f37a81861f4fdd1f4fb28e8547c3", "sha256:db844eb158a87ccab83e868a762ea8024ae27337fc7ddcbfcddd157f841fdfe7", "sha256:defed7ea5f218a9f2336301e6fd379f55c655bea65ba2476346340a0ce6f74a1", "sha256:e16eb9541f3dd1a3e92b89005e37b1257b157b7256df0e36bd7b33b50be73bcb", + "sha256:e1abbeef02962596548382e393f56e4c94acd286bd0c5afba756cffc33670e8a", "sha256:e23281b9a08ec338469268f98f194658abfb13658ee98e2b7f85ee9dd06caa91", "sha256:e2d9e1cbc1b25e22000328702b014227737756f4b5bf5c485ac1d8091ada078b", "sha256:e48f4234f2469ed012a98f4b7874e7f7e173c167bed4934912a29e03167cf6b1", @@ -249,10 +269,10 @@ }, "cloudscraper": { "hashes": [ - "sha256:152fa9f9db5f19f4ada7e75623e93f45d05bfd3fb29d9cae84f29173a2591530", - "sha256:59d964acded1a63336b3ce4daf3f2dfed3de7c88f6bf4d904c661b0b4e1b5f5e" + "sha256:5f0cde23774270e8a092de68e0fbd68e17854c767fc2d4042a91bda9e4816871", + "sha256:ec30da6cee60d0a95e898d9b3aaf09291a0d8b6cf751e86c6f3420b699a00091" ], - "version": "==1.2.64" + "version": "==1.2.66" }, "commonmark": { "hashes": [ @@ -263,35 +283,35 @@ }, "cryptography": { "hashes": [ - "sha256:068147f32fa662c81aebab95c74679b401b12b57494872886eb5c1139250ec5d", - "sha256:06fc3cc7b6f6cca87bd56ec80a580c88f1da5306f505876a71c8cfa7050257dd", - "sha256:25c1d1f19729fb09d42e06b4bf9895212292cb27bb50229f5aa64d039ab29146", - "sha256:402852a0aea73833d982cabb6d0c3bb582c15483d29fb7085ef2c42bfa7e38d7", - "sha256:4e269dcd9b102c5a3d72be3c45d8ce20377b8076a43cbed6f660a1afe365e436", - "sha256:5419a127426084933076132d317911e3c6eb77568a1ce23c3ac1e12d111e61e0", - "sha256:554bec92ee7d1e9d10ded2f7e92a5d70c1f74ba9524947c0ba0c850c7b011828", - "sha256:5e89468fbd2fcd733b5899333bc54d0d06c80e04cd23d8c6f3e0542358c6060b", - "sha256:65535bc550b70bd6271984d9863a37741352b4aad6fb1b3344a54e6950249b55", - "sha256:6ab9516b85bebe7aa83f309bacc5f44a61eeb90d0b4ec125d2d003ce41932d36", - "sha256:6addc3b6d593cd980989261dc1cce38263c76954d758c3c94de51f1e010c9a50", - "sha256:728f2694fa743a996d7784a6194da430f197d5c58e2f4e278612b359f455e4a2", - "sha256:785e4056b5a8b28f05a533fab69febf5004458e20dad7e2e13a3120d8ecec75a", - "sha256:78cf5eefac2b52c10398a42765bfa981ce2372cbc0457e6bf9658f41ec3c41d8", - "sha256:7f836217000342d448e1c9a342e9163149e45d5b5eca76a30e84503a5a96cab0", - "sha256:8d41a46251bf0634e21fac50ffd643216ccecfaf3701a063257fe0b2be1b6548", - "sha256:984fe150f350a3c91e84de405fe49e688aa6092b3525f407a18b9646f6612320", - "sha256:9b24bcff7853ed18a63cfb0c2b008936a9554af24af2fb146e16d8e1aed75748", - "sha256:b1b35d9d3a65542ed2e9d90115dfd16bbc027b3f07ee3304fc83580f26e43249", - "sha256:b1b52c9e5f8aa2b802d48bd693190341fae201ea51c7a167d69fc48b60e8a959", - "sha256:bbf203f1a814007ce24bd4d51362991d5cb90ba0c177a9c08825f2cc304d871f", - "sha256:be243c7e2bfcf6cc4cb350c0d5cdf15ca6383bbcb2a8ef51d3c9411a9d4386f0", - "sha256:bfbe6ee19615b07a98b1d2287d6a6073f734735b49ee45b11324d85efc4d5cbd", - "sha256:c46837ea467ed1efea562bbeb543994c2d1f6e800785bd5a2c98bc096f5cb220", - "sha256:dfb4f4dd568de1b6af9f4cda334adf7d72cf5bc052516e1b2608b683375dd95c", - "sha256:ed7b00096790213e09eb11c97cc6e2b757f15f3d2f85833cd2d3ec3fe37c1722" + "sha256:0e70da4bdff7601b0ef48e6348339e490ebfb0cbe638e083c9c41fb49f00c8bd", + "sha256:10652dd7282de17990b88679cb82f832752c4e8237f0c714be518044269415db", + "sha256:175c1a818b87c9ac80bb7377f5520b7f31b3ef2a0004e2420319beadedb67290", + "sha256:1d7e632804a248103b60b16fb145e8df0bc60eed790ece0d12efe8cd3f3e7744", + "sha256:1f13ddda26a04c06eb57119caf27a524ccae20533729f4b1e4a69b54e07035eb", + "sha256:2ec2a8714dd005949d4019195d72abed84198d877112abb5a27740e217e0ea8d", + "sha256:2fa36a7b2cc0998a3a4d5af26ccb6273f3df133d61da2ba13b3286261e7efb70", + "sha256:2fb481682873035600b5502f0015b664abc26466153fab5c6bc92c1ea69d478b", + "sha256:3178d46f363d4549b9a76264f41c6948752183b3f587666aff0555ac50fd7876", + "sha256:4367da5705922cf7070462e964f66e4ac24162e22ab0a2e9d31f1b270dd78083", + "sha256:4eb85075437f0b1fd8cd66c688469a0c4119e0ba855e3fef86691971b887caf6", + "sha256:50a1494ed0c3f5b4d07650a68cd6ca62efe8b596ce743a5c94403e6f11bf06c1", + "sha256:53049f3379ef05182864d13bb9686657659407148f901f3f1eee57a733fb4b00", + "sha256:6391e59ebe7c62d9902c24a4d8bcbc79a68e7c4ab65863536127c8a9cd94043b", + "sha256:67461b5ebca2e4c2ab991733f8ab637a7265bb582f07c7c88914b5afb88cb95b", + "sha256:78e47e28ddc4ace41dd38c42e6feecfdadf9c3be2af389abbfeef1ff06822285", + "sha256:80ca53981ceeb3241998443c4964a387771588c4e4a5d92735a493af868294f9", + "sha256:8a4b2bdb68a447fadebfd7d24855758fe2d6fecc7fed0b78d190b1af39a8e3b0", + "sha256:8e45653fb97eb2f20b8c96f9cd2b3a0654d742b47d638cf2897afbd97f80fa6d", + "sha256:998cd19189d8a747b226d24c0207fdaa1e6658a1d3f2494541cb9dfbf7dcb6d2", + "sha256:a10498349d4c8eab7357a8f9aa3463791292845b79597ad1b98a543686fb1ec8", + "sha256:b4cad0cea995af760f82820ab4ca54e5471fc782f70a007f31531957f43e9dee", + "sha256:bfe6472507986613dc6cc00b3d492b2f7564b02b3b3682d25ca7f40fa3fd321b", + "sha256:c9e0d79ee4c56d841bd4ac6e7697c8ff3c8d6da67379057f29e66acffcd1e9a7", + "sha256:ca57eb3ddaccd1112c18fc80abe41db443cc2e9dcb1917078e02dfa010a4f353", + "sha256:ce127dd0a6a0811c251a6cddd014d292728484e530d80e872ad9806cfb1c5b3c" ], "markers": "python_version >= '3.6'", - "version": "==38.0.3" + "version": "==38.0.4" }, "dataclasses-json": { "hashes": [ @@ -303,19 +323,19 @@ }, "dateparser": { "hashes": [ - "sha256:711f7eef6d431225bec56c00e386af3f6a47083276253375bdae1ae6c8d23d4a", - "sha256:ae7a7de30f26983d09fff802c1f9d35d54e1c11d7ab52ae904a1f3fc037ecba5" + "sha256:4431159799b63d8acec5d7d844c5e06edf3d1b0eb2bda6d4cac87134ddddd01c", + "sha256:73ec6e44a133c54076ecf9f9dc0fbe3dd4831f154f977ff06f53114d57c5425e" ], "index": "pypi", - "version": "==1.1.3" + "version": "==1.1.4" }, "exceptiongroup": { "hashes": [ - "sha256:2ac84b496be68464a2da60da518af3785fff8b7ec0d090a581604bc870bdee41", - "sha256:affbabf13fb6e98988c38d9c5650e701569fe3c1de3233cfb61c5f33774690ad" + "sha256:542adf9dea4055530d6e1279602fa5cb11dab2395fa650b8674eaec35fc4a828", + "sha256:bd14967b79cd9bdb54d97323216f8fdf533e278df937aa2a90089e7d6e06e5ec" ], "markers": "python_version < '3.11'", - "version": "==1.0.0" + "version": "==1.0.4" }, "ffmpeg-python": { "hashes": [ @@ -327,11 +347,11 @@ }, "filelock": { "hashes": [ - "sha256:55447caa666f2198c5b6b13a26d2084d26fa5b115c00d065664b2124680c4edc", - "sha256:617eb4e5eedc82fc5f47b6d61e4d11cb837c56cb4544e39081099fa17ad109d4" + "sha256:7565f628ea56bfcd8e54e42bdc55da899c85c1abfe1b5bcfd147e9188cebb3b2", + "sha256:8df285554452285f79c035efb0c861eb33a4bcfa5b7a137016e32e6a90f9792c" ], "markers": "python_version >= '3.7'", - "version": "==3.8.0" + "version": "==3.8.2" }, "flask": { "hashes": [ @@ -350,27 +370,27 @@ }, "google-api-core": { "hashes": [ - "sha256:10c06f7739fe57781f87523375e8e1a3a4674bf6392cd6131a3222182b971320", - "sha256:34f24bd1d5f72a8c4519773d99ca6bf080a6c4e041b4e9f024fe230191dda62e" + "sha256:4b9bb5d5a380a0befa0573b302651b8a9a89262c1730e37bf423cec511804c22", + "sha256:ce222e27b0de0d7bc63eb043b956996d6dccab14cc3b690aaea91c9cc99dc16e" ], "markers": "python_version >= '3.7'", - "version": "==2.10.2" + "version": "==2.11.0" }, "google-api-python-client": { "hashes": [ - "sha256:2c6611530308b3f931dcf1360713aa3a20cf465d0bf2bac65f2ec99e8c9860de", - "sha256:b8a0ca8454ad57bc65199044717d3d214197ae1e2d666426bbcd4021b36762e0" + "sha256:03624a28b5ba94f3c3d44761081f5dbf8cabaa20c5c3a96c046457c5713efb9b", + "sha256:bc2447a7479006d98927fb20faa74d892d3758ff68e99b621367632bc42b8af8" ], "index": "pypi", - "version": "==2.65.0" + "version": "==2.69.0" }, "google-auth": { "hashes": [ - "sha256:1ad5b0e6eba5f69645971abb3d2c197537d5914070a8c6d30299dfdb07c5c700", - "sha256:cf24817855d874ede2efd071aa22125445f555de1685b739a9782fcf408c2a3d" + "sha256:6897b93556d8d807ad70701bb89f000183aea366ca7ed94680828b37437a4994", + "sha256:72f12a6cfc968d754d7bdab369c5c5c16032106e52d32c6dfd8484e4c01a6d1f" ], "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5'", - "version": "==2.14.0" + "version": "==2.15.0" }, "google-auth-httplib2": { "hashes": [ @@ -382,27 +402,27 @@ }, "google-auth-oauthlib": { "hashes": [ - "sha256:53019edbde83e08ff0740eefc5bded7e26a289941d12e7ae1f0f5bacf2faa031", - "sha256:db11bce4b3effc99b518ec22a2903470e0853c0c92be57694e3684e738d22513" + "sha256:40cc612a13c3336d5433e94e2adb42a0c88f6feb6c55769e44500fc70043a576", + "sha256:81056a310fb1c4a3e5a7e1a443e1eb96593c6bbc55b26c0261e4d3295d3e6593" ], "index": "pypi", - "version": "==0.7.0" + "version": "==0.8.0" }, "googleapis-common-protos": { "hashes": [ - "sha256:8eb2cbc91b69feaf23e32452a7ae60e791e09967d81d4fcc7fc388182d1bd394", - "sha256:c25873c47279387cfdcbdafa36149887901d36202cb645a0e4f29686bf6e4417" + "sha256:27a849d6205838fb6cc3c1c21cb9800707a661bb21c6ce7fb13e99eb1f8a0c46", + "sha256:a9f4a1d7f6d9809657b7f1316a1aa527f6664891531bcfcc13b6696e685f443c" ], "markers": "python_version >= '3.7'", - "version": "==1.56.4" + "version": "==1.57.0" }, "gspread": { "hashes": [ - "sha256:41f7a416425f1ec5a1b677f49b8fbf599102766c27ed7be6601a58c9a1550ebc", - "sha256:d3bbff4b7aad0fc2c986458e148537a02fe7b46e7162f41f3a42392bfa2adb89" + "sha256:ce76f9c16b88ccb792350142224a59afa8e69f7463f3d3417148cbe892efc7cb", + "sha256:dbeedd08c6a7f7b0bfc1a54e17c29205362250c77bf98e11125c5d99fd7f4ba7" ], "index": "pypi", - "version": "==5.6.2" + "version": "==5.7.2" }, "h11": { "hashes": [ @@ -591,11 +611,11 @@ }, "marshmallow": { "hashes": [ - "sha256:35e02a3a06899c9119b785c12a22f4cda361745d66a71ab691fd7610202ae104", - "sha256:6804c16114f7fce1f5b4dadc31f4674af23317fcc7f075da21e35c1a35d781f7" + "sha256:90032c0fd650ce94b6ec6dc8dfeb0e3ff50c144586462c389b81a07205bedb78", + "sha256:93f0958568da045b0021ec6aeb7ac37c81bfcccbb9a0e7ed8559885070b3a19b" ], "markers": "python_version >= '3.7'", - "version": "==3.18.0" + "version": "==3.19.0" }, "marshmallow-enum": { "hashes": [ @@ -645,31 +665,31 @@ }, "packaging": { "hashes": [ - "sha256:dd47c42927d89ab911e606518907cc2d3a1f38bbd026385970643f9c5b8ecfeb", - "sha256:ef103e05f519cdc783ae24ea4e2e0f508a9c99b2d4969652eed6a2e1ea5bd522" + "sha256:2198ec20bd4c017b8f9717e00f0c8714076fc2fd93816750ab48e2c41de2cfd3", + "sha256:957e2148ba0e1a3b282772e791ef1d8083648bc131c8ab0c1feba110ce1146c3" ], - "markers": "python_version >= '3.6'", - "version": "==21.3" + "markers": "python_version >= '3.7'", + "version": "==22.0" }, "protobuf": { "hashes": [ - "sha256:2c9c2ed7466ad565f18668aa4731c535511c5d9a40c6da39524bccf43e441719", - "sha256:48e2cd6b88c6ed3d5877a3ea40df79d08374088e89bedc32557348848dff250b", - "sha256:5b0834e61fb38f34ba8840d7dcb2e5a2f03de0c714e0293b3963b79db26de8ce", - "sha256:61f21493d96d2a77f9ca84fefa105872550ab5ef71d21c458eb80edcf4885a99", - "sha256:6e0be9f09bf9b6cf497b27425487706fa48c6d1632ddd94dab1a5fe11a422392", - "sha256:6e312e280fbe3c74ea9e080d9e6080b636798b5e3939242298b591064470b06b", - "sha256:7eb8f2cc41a34e9c956c256e3ac766cf4e1a4c9c925dc757a41a01be3e852965", - "sha256:84ea107016244dfc1eecae7684f7ce13c788b9a644cd3fca5b77871366556444", - "sha256:9227c14010acd9ae7702d6467b4625b6fe853175a6b150e539b21d2b2f2b409c", - "sha256:a419cc95fca8694804709b8c4f2326266d29659b126a93befe210f5bbc772536", - "sha256:a7d0ea43949d45b836234f4ebb5ba0b22e7432d065394b532cdca8f98415e3cf", - "sha256:b5ab0b8918c136345ff045d4b3d5f719b505b7c8af45092d7f45e304f55e50a1", - "sha256:e575c57dc8b5b2b2caa436c16d44ef6981f2235eb7179bfc847557886376d740", - "sha256:f9eae277dd240ae19bb06ff4e2346e771252b0e619421965504bd1b1bba7c5fa" + "sha256:25266bf373ee06d5d66f9eb1ec9d434b243dccce5c32faf151054cfa6f9dcbf1", + "sha256:260e346927fd4e6fbb49ab545137b19610c24a1d853dc5f29ddf777ab1987211", + "sha256:2c6a4d13732d9b094db31b3841986c38b17ac61a3fe05ee26a779d94c4c3fb43", + "sha256:4922e3320ed70e81f05060822da36923d09fd9e04e17f411f2d8d8d0070f9f5c", + "sha256:4b75c947289a2e9c1f37d21c593f1ef6fb4fed33977dfb2ac84f799eb29a8ff4", + "sha256:4d01ef83517c181d60ea1c6d0b2f644be250ade740d6554a2f5a021b1ad622e3", + "sha256:553e35c0878f6855e55f01a14561e6bce6df79b6636a5acf83b9d9ac7eab7922", + "sha256:85ccb4753ee21de7dc81a7a68a051f25dbe133ffa01a639ac998427d0b223387", + "sha256:a5a14b907a191319e7a58b38c583bbf50deb21e002f723a912c5e4f6969a778e", + "sha256:a944dc9550baae276afc7dc8193191d4c2ad660270a1e5ed5a71539817ebe2e2", + "sha256:bab4b21a986ded225b9392c07ce21c35d790951f51e1ebfd32e4d443b05c3726", + "sha256:c3b9e329b4c247dc3ba5c50f60915a84e08278eb6d9e3fa674d0d04ff816bfd7", + "sha256:d91a47c77b33580024b0271b65bb820c4e0264c25eb49151ad01e691de8fa0b6", + "sha256:efb16b16fd3eef25357f84d516062753014b76279ce4e0ec4880badd2fba7370" ], "markers": "python_version >= '3.7'", - "version": "==4.21.9" + "version": "==4.21.11" }, "pyaes": { "hashes": [ @@ -722,39 +742,35 @@ }, "pycryptodomex": { "hashes": [ - "sha256:04cc393045a8f19dd110c975e30f38ed7ab3faf21ede415ea67afebd95a22380", - "sha256:0776bfaf2c48154ab54ea45392847c1283d2fcf64e232e85565f858baedfc1fa", - "sha256:0fadb9f7fa3150577800eef35f62a8a24b9ddf1563ff060d9bd3af22d3952c8c", - "sha256:18e2ab4813883ae63396c0ffe50b13554b32bb69ec56f0afaf052e7a7ae0d55b", - "sha256:191e73bc84a8064ad1874dba0ebadedd7cce4dedee998549518f2c74a003b2e1", - "sha256:35a8f7afe1867118330e2e0e0bf759c409e28557fb1fc2fbb1c6c937297dbe9a", - "sha256:3709f13ca3852b0b07fc04a2c03b379189232b24007c466be0f605dd4723e9d4", - "sha256:4540904c09704b6f831059c0dfb38584acb82cb97b0125cd52688c1f1e3fffa6", - "sha256:463119d7d22d0fc04a0f9122e9d3e6121c6648bcb12a052b51bd1eed1b996aa2", - "sha256:46b3f05f2f7ac7841053da4e0f69616929ca3c42f238c405f6c3df7759ad2780", - "sha256:48697790203909fab02a33226fda546604f4e2653f9d47bc5d3eb40879fa7c64", - "sha256:5676a132169a1c1a3712edf25250722ebc8c9102aa9abd814df063ca8362454f", - "sha256:65204412d0c6a8e3c41e21e93a5e6054a74fea501afa03046a388cf042e3377a", - "sha256:67e1e6a92151023ccdfcfbc0afb3314ad30080793b4c27956ea06ab1fb9bcd8a", - "sha256:6f5b6ba8aefd624834bc177a2ac292734996bb030f9d1b388e7504103b6fcddf", - "sha256:7341f1bb2dadb0d1a0047f34c3a58208a92423cdbd3244d998e4b28df5eac0ed", - "sha256:78d9621cf0ea35abf2d38fa2ca6d0634eab6c991a78373498ab149953787e5e5", - "sha256:8eecdf9cdc7343001d047f951b9cc805cd68cb6cd77b20ea46af5bffc5bd3dfb", - "sha256:94c7b60e1f52e1a87715571327baea0733708ab4723346598beca4a3b6879794", - "sha256:996e1ba717077ce1e6d4849af7a1426f38b07b3d173b879e27d5e26d2e958beb", - "sha256:a07a64709e366c2041cd5cfbca592b43998bf4df88f7b0ca73dca37071ccf1bd", - "sha256:b6306403228edde6e289f626a3908a2f7f67c344e712cf7c0a508bab3ad9e381", - "sha256:b9279adc16e4b0f590ceff581f53a80179b02cba9056010d733eb4196134a870", - "sha256:c4cb9cb492ea7dcdf222a8d19a1d09002798ea516aeae8877245206d27326d86", - "sha256:dd452a5af7014e866206d41751886c9b4bf379a339fdf2dbfc7dd16c0fb4f8e0", - "sha256:e2b12968522a0358b8917fc7b28865acac002f02f4c4c6020fcb264d76bfd06d", - "sha256:e3164a18348bd53c69b4435ebfb4ac8a4076291ffa2a70b54f0c4b80c7834b1d", - "sha256:e47bf8776a7e15576887f04314f5228c6527b99946e6638cf2f16da56d260cab", - "sha256:f8be976cec59b11f011f790b88aca67b4ea2bd286578d0bd3e31bcd19afcd3e4", - "sha256:fc9bc7a9b79fe5c750fc81a307052f8daabb709bdaabb0fb18fb136b66b653b5" + "sha256:04610536921c1ec7adba158ef570348550c9f3a40bc24be9f8da2ef7ab387981", + "sha256:0ba28aa97cdd3ff5ed1a4f2b7f5cd04e721166bd75bd2b929e2734433882b583", + "sha256:0da835af786fdd1c9930994c78b23e88d816dc3f99aa977284a21bbc26d19735", + "sha256:1619087fb5b31510b0b0b058a54f001a5ffd91e6ffee220d9913064519c6a69d", + "sha256:1cda60207be8c1cf0b84b9138f9e3ca29335013d2b690774a5e94678ff29659a", + "sha256:22aed0868622d95179217c298e37ed7410025c7b29dac236d3230617d1e4ed56", + "sha256:231dc8008cbdd1ae0e34645d4523da2dbc7a88c325f0d4a59635a86ee25b41dd", + "sha256:2ad9bb86b355b6104796567dd44c215b3dc953ef2fae5e0bdfb8516731df92cf", + "sha256:4dbbe18cc232b5980c7633972ae5417d0df76fe89e7db246eefd17ef4d8e6d7a", + "sha256:6a465e4f856d2a4f2a311807030c89166529ccf7ccc65bef398de045d49144b6", + "sha256:70288d9bfe16b2fd0d20b6c365db614428f1bcde7b20d56e74cf88ade905d9eb", + "sha256:7993d26dae4d83b8f4ce605bb0aecb8bee330bb3c95475ef06f3694403621e71", + "sha256:8851585ff19871e5d69e1790f4ca5f6fd1699d6b8b14413b472a4c0dbc7ea780", + "sha256:893f8a97d533c66cc3a56e60dd3ed40a3494ddb4aafa7e026429a08772f8a849", + "sha256:8dd2d9e3c617d0712ed781a77efd84ea579e76c5f9b2a4bc0b684ebeddf868b2", + "sha256:a1c0ae7123448ecb034c75c713189cb00ebe2d415b11682865b6c54d200d9c93", + "sha256:b0789a8490114a2936ed77c87792cfe77582c829cb43a6d86ede0f9624ba8aa3", + "sha256:b3d04c00d777c36972b539fb79958790126847d84ec0129fce1efef250bfe3ce", + "sha256:ba57ac7861fd2c837cdb33daf822f2a052ff57dd769a2107807f52a36d0e8d38", + "sha256:ce338a9703f54b2305a408fc9890eb966b727ce72b69f225898bb4e9d9ed3f1f", + "sha256:daa67f5ebb6fbf1ee9c90decaa06ca7fc88a548864e5e484d52b0920a57fe8a5", + "sha256:e2453162f473c1eae4826eb10cd7bce19b5facac86d17fb5f29a570fde145abd", + "sha256:e25a2f5667d91795f9417cb856f6df724ccdb0cdd5cbadb212ee9bf43946e9f8", + "sha256:e5a670919076b71522c7d567a9043f66f14b202414a63c3a078b5831ae342c03", + "sha256:e9ba9d8ed638733c9e95664470b71d624a6def149e2db6cc52c1aca5a6a2df1d", + "sha256:f2b971a7b877348a27dcfd0e772a0343fb818df00b74078e91c008632284137d" ], "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'", - "version": "==3.15.0" + "version": "==3.16.0" }, "pygments": { "hashes": [ @@ -798,19 +814,19 @@ }, "python-slugify": { "hashes": [ - "sha256:272d106cb31ab99b3496ba085e3fea0e9e76dcde967b5e9992500d1f785ce4e1", - "sha256:7b2c274c308b62f4269a9ba701aa69a797e9bca41aeee5b3a9e79e36b6656927" + "sha256:003aee64f9fd955d111549f96c4b58a3f40b9319383c70fad6277a4974bbf570", + "sha256:7a0f21a39fa6c1c4bf2e5984c9b9ae944483fd10b54804cb0e23a3ccd4954f0b" ], "index": "pypi", - "version": "==6.1.2" + "version": "==7.0.0" }, "python-twitter-v2": { "hashes": [ - "sha256:04349e74ec6ebaa3c71d02dc82610acd3b6b346a0060adf4bad2379fd3f46701", - "sha256:1b17b3243108a7d8d1af0b71a3e87f28d105b5fe61cfd09944e28a7903769c81" + "sha256:18c14853da8b499775a11a3f5e1d0692a7017fa41eca91ac5afa73f35b935a90", + "sha256:fbe582ae7c6b33f6055b97e23dd106874e6650091d257fe67bfd024b96ebf8d6" ], "index": "pypi", - "version": "==0.7.9" + "version": "==0.8.0" }, "pytz": { "hashes": [ @@ -875,83 +891,97 @@ }, "regex": { "hashes": [ - "sha256:0008650041531d0eadecc96a73d37c2dc4821cf51b0766e374cb4f1ddc4e1c14", - "sha256:03299b0bcaa7824eb7c0ebd7ef1e3663302d1b533653bfe9dc7e595d453e2ae9", - "sha256:06b1df01cf2aef3a9790858af524ae2588762c8a90e784ba00d003f045306204", - "sha256:09b4b6ccc61d4119342b26246ddd5a04accdeebe36bdfe865ad87a0784efd77f", - "sha256:0be0c34a39e5d04a62fd5342f0886d0e57592a4f4993b3f9d257c1f688b19737", - "sha256:0d96eec8550fd2fd26f8e675f6d8b61b159482ad8ffa26991b894ed5ee19038b", - "sha256:0eb0e2845e81bdea92b8281a3969632686502565abf4a0b9e4ab1471c863d8f3", - "sha256:13bbf0c9453c6d16e5867bda7f6c0c7cff1decf96c5498318bb87f8136d2abd4", - "sha256:17e51ad1e6131c496b58d317bc9abec71f44eb1957d32629d06013a21bc99cac", - "sha256:1977bb64264815d3ef016625adc9df90e6d0e27e76260280c63eca993e3f455f", - "sha256:1e30762ddddb22f7f14c4f59c34d3addabc789216d813b0f3e2788d7bcf0cf29", - "sha256:1e73652057473ad3e6934944af090852a02590c349357b79182c1b681da2c772", - "sha256:20e6a27959f162f979165e496add0d7d56d7038237092d1aba20b46de79158f1", - "sha256:286ff9ec2709d56ae7517040be0d6c502642517ce9937ab6d89b1e7d0904f863", - "sha256:297c42ede2c81f0cb6f34ea60b5cf6dc965d97fa6936c11fc3286019231f0d66", - "sha256:320c2f4106962ecea0f33d8d31b985d3c185757c49c1fb735501515f963715ed", - "sha256:35ed2f3c918a00b109157428abfc4e8d1ffabc37c8f9abc5939ebd1e95dabc47", - "sha256:3d146e5591cb67c5e836229a04723a30af795ef9b70a0bbd913572e14b7b940f", - "sha256:42bb37e2b2d25d958c25903f6125a41aaaa1ed49ca62c103331f24b8a459142f", - "sha256:42d6007722d46bd2c95cce700181570b56edc0dcbadbfe7855ec26c3f2d7e008", - "sha256:43eba5c46208deedec833663201752e865feddc840433285fbadee07b84b464d", - "sha256:452519bc4c973e961b1620c815ea6dd8944a12d68e71002be5a7aff0a8361571", - "sha256:4b9c16a807b17b17c4fa3a1d8c242467237be67ba92ad24ff51425329e7ae3d0", - "sha256:5510932596a0f33399b7fff1bd61c59c977f2b8ee987b36539ba97eb3513584a", - "sha256:55820bc631684172b9b56a991d217ec7c2e580d956591dc2144985113980f5a3", - "sha256:57484d39447f94967e83e56db1b1108c68918c44ab519b8ecfc34b790ca52bf7", - "sha256:58ba41e462653eaf68fc4a84ec4d350b26a98d030be1ab24aba1adcc78ffe447", - "sha256:5bc5f921be39ccb65fdda741e04b2555917a4bced24b4df14eddc7569be3b493", - "sha256:5dcc4168536c8f68654f014a3db49b6b4a26b226f735708be2054314ed4964f4", - "sha256:5f92a7cdc6a0ae2abd184e8dfd6ef2279989d24c85d2c85d0423206284103ede", - "sha256:67250b36edfa714ba62dc62d3f238e86db1065fccb538278804790f578253640", - "sha256:6df070a986fc064d865c381aecf0aaff914178fdf6874da2f2387e82d93cc5bd", - "sha256:729aa8ca624c42f309397c5fc9e21db90bf7e2fdd872461aabdbada33de9063c", - "sha256:72bc3a5effa5974be6d965ed8301ac1e869bc18425c8a8fac179fbe7876e3aee", - "sha256:74d86e8924835f863c34e646392ef39039405f6ce52956d8af16497af4064a30", - "sha256:79e5af1ff258bc0fe0bdd6f69bc4ae33935a898e3cbefbbccf22e88a27fa053b", - "sha256:7b103dffb9f6a47ed7ffdf352b78cfe058b1777617371226c1894e1be443afec", - "sha256:83f03f0bd88c12e63ca2d024adeee75234d69808b341e88343b0232329e1f1a1", - "sha256:86d7a68fa53688e1f612c3246044157117403c7ce19ebab7d02daf45bd63913e", - "sha256:878c626cbca3b649e14e972c14539a01191d79e58934e3f3ef4a9e17f90277f8", - "sha256:878f5d649ba1db9f52cc4ef491f7dba2d061cdc48dd444c54260eebc0b1729b9", - "sha256:87bc01226cd288f0bd9a4f9f07bf6827134dc97a96c22e2d28628e824c8de231", - "sha256:8babb2b5751105dc0aef2a2e539f4ba391e738c62038d8cb331c710f6b0f3da7", - "sha256:91e0f7e7be77250b808a5f46d90bf0032527d3c032b2131b63dee54753a4d729", - "sha256:9557545c10d52c845f270b665b52a6a972884725aa5cf12777374e18f2ea8960", - "sha256:9ccb0a4ab926016867260c24c192d9df9586e834f5db83dfa2c8fffb3a6e5056", - "sha256:9d828c5987d543d052b53c579a01a52d96b86f937b1777bbfe11ef2728929357", - "sha256:9efa41d1527b366c88f265a227b20bcec65bda879962e3fc8a2aee11e81266d7", - "sha256:aaf5317c961d93c1a200b9370fb1c6b6836cc7144fef3e5a951326912bf1f5a3", - "sha256:ab69b4fe09e296261377d209068d52402fb85ef89dc78a9ac4a29a895f4e24a7", - "sha256:ad397bc7d51d69cb07ef89e44243f971a04ce1dca9bf24c992c362406c0c6573", - "sha256:ae17fc8103f3b63345709d3e9654a274eee1c6072592aec32b026efd401931d0", - "sha256:af4d8cc28e4c7a2f6a9fed544228c567340f8258b6d7ea815b62a72817bbd178", - "sha256:b22ff939a8856a44f4822da38ef4868bd3a9ade22bb6d9062b36957c850e404f", - "sha256:b549d851f91a4efb3e65498bd4249b1447ab6035a9972f7fc215eb1f59328834", - "sha256:be319f4eb400ee567b722e9ea63d5b2bb31464e3cf1b016502e3ee2de4f86f5c", - "sha256:c0446b2871335d5a5e9fcf1462f954586b09a845832263db95059dcd01442015", - "sha256:c68d2c04f7701a418ec2e5631b7f3552efc32f6bcc1739369c6eeb1af55f62e0", - "sha256:c87ac58b9baaf50b6c1b81a18d20eda7e2883aa9a4fb4f1ca70f2e443bfcdc57", - "sha256:caa2734ada16a44ae57b229d45091f06e30a9a52ace76d7574546ab23008c635", - "sha256:cb34c2d66355fb70ae47b5595aafd7218e59bb9c00ad8cc3abd1406ca5874f07", - "sha256:cb3652bbe6720786b9137862205986f3ae54a09dec8499a995ed58292bdf77c2", - "sha256:cf668f26604e9f7aee9f8eaae4ca07a948168af90b96be97a4b7fa902a6d2ac1", - "sha256:d326ff80ed531bf2507cba93011c30fff2dd51454c85f55df0f59f2030b1687b", - "sha256:d6c2441538e4fadd4291c8420853431a229fcbefc1bf521810fbc2629d8ae8c2", - "sha256:d6ecfd1970b3380a569d7b3ecc5dd70dba295897418ed9e31ec3c16a5ab099a5", - "sha256:e5602a9b5074dcacc113bba4d2f011d2748f50e3201c8139ac5b68cf2a76bd8b", - "sha256:ef806f684f17dbd6263d72a54ad4073af42b42effa3eb42b877e750c24c76f86", - "sha256:f3356afbb301ec34a500b8ba8b47cba0b44ed4641c306e1dd981a08b416170b5", - "sha256:f6f7ee2289176cb1d2c59a24f50900f8b9580259fa9f1a739432242e7d254f93", - "sha256:f7e8f1ee28e0a05831c92dc1c0c1c94af5289963b7cf09eca5b5e3ce4f8c91b0", - "sha256:f8169ec628880bdbca67082a9196e2106060a4a5cbd486ac51881a4df805a36f", - "sha256:fbc88d3ba402b5d041d204ec2449c4078898f89c4a6e6f0ed1c1a510ef1e221d", - "sha256:fbd3fe37353c62fd0eb19fb76f78aa693716262bcd5f9c14bb9e5aca4b3f0dc4" + "sha256:052b670fafbe30966bbe5d025e90b2a491f85dfe5b2583a163b5e60a85a321ad", + "sha256:0653d012b3bf45f194e5e6a41df9258811ac8fc395579fa82958a8b76286bea4", + "sha256:0a069c8483466806ab94ea9068c34b200b8bfc66b6762f45a831c4baaa9e8cdd", + "sha256:0cf0da36a212978be2c2e2e2d04bdff46f850108fccc1851332bcae51c8907cc", + "sha256:131d4be09bea7ce2577f9623e415cab287a3c8e0624f778c1d955ec7c281bd4d", + "sha256:144486e029793a733e43b2e37df16a16df4ceb62102636ff3db6033994711066", + "sha256:1ddf14031a3882f684b8642cb74eea3af93a2be68893901b2b387c5fd92a03ec", + "sha256:1eba476b1b242620c266edf6325b443a2e22b633217a9835a52d8da2b5c051f9", + "sha256:20f61c9944f0be2dc2b75689ba409938c14876c19d02f7585af4460b6a21403e", + "sha256:22960019a842777a9fa5134c2364efaed5fbf9610ddc5c904bd3a400973b0eb8", + "sha256:22e7ebc231d28393dfdc19b185d97e14a0f178bedd78e85aad660e93b646604e", + "sha256:23cbb932cc53a86ebde0fb72e7e645f9a5eec1a5af7aa9ce333e46286caef783", + "sha256:29c04741b9ae13d1e94cf93fca257730b97ce6ea64cfe1eba11cf9ac4e85afb6", + "sha256:2bde29cc44fa81c0a0c8686992c3080b37c488df167a371500b2a43ce9f026d1", + "sha256:2cdc55ca07b4e70dda898d2ab7150ecf17c990076d3acd7a5f3b25cb23a69f1c", + "sha256:370f6e97d02bf2dd20d7468ce4f38e173a124e769762d00beadec3bc2f4b3bc4", + "sha256:395161bbdbd04a8333b9ff9763a05e9ceb4fe210e3c7690f5e68cedd3d65d8e1", + "sha256:44136355e2f5e06bf6b23d337a75386371ba742ffa771440b85bed367c1318d1", + "sha256:44a6c2f6374e0033873e9ed577a54a3602b4f609867794c1a3ebba65e4c93ee7", + "sha256:4919899577ba37f505aaebdf6e7dc812d55e8f097331312db7f1aab18767cce8", + "sha256:4b4b1fe58cd102d75ef0552cf17242705ce0759f9695334a56644ad2d83903fe", + "sha256:4bdd56ee719a8f751cf5a593476a441c4e56c9b64dc1f0f30902858c4ef8771d", + "sha256:4bf41b8b0a80708f7e0384519795e80dcb44d7199a35d52c15cc674d10b3081b", + "sha256:4cac3405d8dda8bc6ed499557625585544dd5cbf32072dcc72b5a176cb1271c8", + "sha256:4fe7fda2fe7c8890d454f2cbc91d6c01baf206fbc96d89a80241a02985118c0c", + "sha256:50921c140561d3db2ab9f5b11c5184846cde686bb5a9dc64cae442926e86f3af", + "sha256:5217c25229b6a85049416a5c1e6451e9060a1edcf988641e309dbe3ab26d3e49", + "sha256:5352bea8a8f84b89d45ccc503f390a6be77917932b1c98c4cdc3565137acc714", + "sha256:542e3e306d1669b25936b64917285cdffcd4f5c6f0247636fec037187bd93542", + "sha256:543883e3496c8b6d58bd036c99486c3c8387c2fc01f7a342b760c1ea3158a318", + "sha256:586b36ebda81e6c1a9c5a5d0bfdc236399ba6595e1397842fd4a45648c30f35e", + "sha256:597f899f4ed42a38df7b0e46714880fb4e19a25c2f66e5c908805466721760f5", + "sha256:5a260758454580f11dd8743fa98319bb046037dfab4f7828008909d0aa5292bc", + "sha256:5aefb84a301327ad115e9d346c8e2760009131d9d4b4c6b213648d02e2abe144", + "sha256:5e6a5567078b3eaed93558842346c9d678e116ab0135e22eb72db8325e90b453", + "sha256:5ff525698de226c0ca743bfa71fc6b378cda2ddcf0d22d7c37b1cc925c9650a5", + "sha256:61edbca89aa3f5ef7ecac8c23d975fe7261c12665f1d90a6b1af527bba86ce61", + "sha256:659175b2144d199560d99a8d13b2228b85e6019b6e09e556209dfb8c37b78a11", + "sha256:6a9a19bea8495bb419dc5d38c4519567781cd8d571c72efc6aa959473d10221a", + "sha256:6b30bddd61d2a3261f025ad0f9ee2586988c6a00c780a2fb0a92cea2aa702c54", + "sha256:6ffd55b5aedc6f25fd8d9f905c9376ca44fcf768673ffb9d160dd6f409bfda73", + "sha256:702d8fc6f25bbf412ee706bd73019da5e44a8400861dfff7ff31eb5b4a1276dc", + "sha256:74bcab50a13960f2a610cdcd066e25f1fd59e23b69637c92ad470784a51b1347", + "sha256:75f591b2055523fc02a4bbe598aa867df9e953255f0b7f7715d2a36a9c30065c", + "sha256:763b64853b0a8f4f9cfb41a76a4a85a9bcda7fdda5cb057016e7706fde928e66", + "sha256:76c598ca73ec73a2f568e2a72ba46c3b6c8690ad9a07092b18e48ceb936e9f0c", + "sha256:78d680ef3e4d405f36f0d6d1ea54e740366f061645930072d39bca16a10d8c93", + "sha256:7b280948d00bd3973c1998f92e22aa3ecb76682e3a4255f33e1020bd32adf443", + "sha256:7db345956ecce0c99b97b042b4ca7326feeec6b75facd8390af73b18e2650ffc", + "sha256:7dbdce0c534bbf52274b94768b3498abdf675a691fec5f751b6057b3030f34c1", + "sha256:7ef6b5942e6bfc5706301a18a62300c60db9af7f6368042227ccb7eeb22d0892", + "sha256:7f5a3ffc731494f1a57bd91c47dc483a1e10048131ffb52d901bfe2beb6102e8", + "sha256:8a45b6514861916c429e6059a55cf7db74670eaed2052a648e3e4d04f070e001", + "sha256:8ad241da7fac963d7573cc67a064c57c58766b62a9a20c452ca1f21050868dfa", + "sha256:8b0886885f7323beea6f552c28bff62cbe0983b9fbb94126531693ea6c5ebb90", + "sha256:8ca88da1bd78990b536c4a7765f719803eb4f8f9971cc22d6ca965c10a7f2c4c", + "sha256:8e0caeff18b96ea90fc0eb6e3bdb2b10ab5b01a95128dfeccb64a7238decf5f0", + "sha256:957403a978e10fb3ca42572a23e6f7badff39aa1ce2f4ade68ee452dc6807692", + "sha256:9af69f6746120998cd9c355e9c3c6aec7dff70d47247188feb4f829502be8ab4", + "sha256:9c94f7cc91ab16b36ba5ce476f1904c91d6c92441f01cd61a8e2729442d6fcf5", + "sha256:a37d51fa9a00d265cf73f3de3930fa9c41548177ba4f0faf76e61d512c774690", + "sha256:a3a98921da9a1bf8457aeee6a551948a83601689e5ecdd736894ea9bbec77e83", + "sha256:a3c1ebd4ed8e76e886507c9eddb1a891673686c813adf889b864a17fafcf6d66", + "sha256:a5f9505efd574d1e5b4a76ac9dd92a12acb2b309551e9aa874c13c11caefbe4f", + "sha256:a8ff454ef0bb061e37df03557afda9d785c905dab15584860f982e88be73015f", + "sha256:a9d0b68ac1743964755ae2d89772c7e6fb0118acd4d0b7464eaf3921c6b49dd4", + "sha256:aa62a07ac93b7cb6b7d0389d8ef57ffc321d78f60c037b19dfa78d6b17c928ee", + "sha256:ac741bf78b9bb432e2d314439275235f41656e189856b11fb4e774d9f7246d81", + "sha256:ae1e96785696b543394a4e3f15f3f225d44f3c55dafe3f206493031419fedf95", + "sha256:b683e5fd7f74fb66e89a1ed16076dbab3f8e9f34c18b1979ded614fe10cdc4d9", + "sha256:b7a8b43ee64ca8f4befa2bea4083f7c52c92864d8518244bfa6e88c751fa8fff", + "sha256:b8e38472739028e5f2c3a4aded0ab7eadc447f0d84f310c7a8bb697ec417229e", + "sha256:bfff48c7bd23c6e2aec6454aaf6edc44444b229e94743b34bdcdda2e35126cf5", + "sha256:c14b63c9d7bab795d17392c7c1f9aaabbffd4cf4387725a0ac69109fb3b550c6", + "sha256:c27cc1e4b197092e50ddbf0118c788d9977f3f8f35bfbbd3e76c1846a3443df7", + "sha256:c28d3309ebd6d6b2cf82969b5179bed5fefe6142c70f354ece94324fa11bf6a1", + "sha256:c670f4773f2f6f1957ff8a3962c7dd12e4be54d05839b216cb7fd70b5a1df394", + "sha256:ce6910b56b700bea7be82c54ddf2e0ed792a577dfaa4a76b9af07d550af435c6", + "sha256:d0213671691e341f6849bf33cd9fad21f7b1cb88b89e024f33370733fec58742", + "sha256:d03fe67b2325cb3f09be029fd5da8df9e6974f0cde2c2ac6a79d2634e791dd57", + "sha256:d0e5af9a9effb88535a472e19169e09ce750c3d442fb222254a276d77808620b", + "sha256:d243b36fbf3d73c25e48014961e83c19c9cc92530516ce3c43050ea6276a2ab7", + "sha256:d26166acf62f731f50bdd885b04b38828436d74e8e362bfcb8df221d868b5d9b", + "sha256:d403d781b0e06d2922435ce3b8d2376579f0c217ae491e273bab8d092727d244", + "sha256:d8716f82502997b3d0895d1c64c3b834181b1eaca28f3f6336a71777e437c2af", + "sha256:e4f781ffedd17b0b834c8731b75cce2639d5a8afe961c1e58ee7f1f20b3af185", + "sha256:e613a98ead2005c4ce037c7b061f2409a1a4e45099edb0ef3200ee26ed2a69a8", + "sha256:ef4163770525257876f10e8ece1cf25b71468316f61451ded1a6f44273eedeb5" ], "markers": "python_version >= '3.6'", - "version": "==2022.3.2" + "version": "==2022.10.31" }, "requests": { "hashes": [ @@ -1003,10 +1033,11 @@ }, "selenium": { "hashes": [ - "sha256:a733dd77d3171b846893f4d51b18967d809313f547a10974e26579f9ce797462" + "sha256:06a1c7d9f313130b21c3218ddd8852070d0e7419afdd31f96160cd576555a5ce", + "sha256:3aefa14a28a42e520550c1cd0f29cf1d566328186ea63aa9a3e01fb265b5894d" ], "index": "pypi", - "version": "==4.5.0" + "version": "==4.7.2" }, "six": { "hashes": [ @@ -1049,11 +1080,11 @@ }, "telethon": { "hashes": [ - "sha256:8df802aad2d11f7198f1d5b1d84c7498ef19c28e160041dcb8aaf0814f91115b", - "sha256:a085348801bd62db79ad75c9a67c5c8312507b113f0228b92e2dd4397edc7c1d" + "sha256:148ac8c27908853d5d8a116d55ce947e9ba167bb697c75226ae95645b2e5a504", + "sha256:de7a1619110a2c06390fb5340839c6503c6b108b5f1a2f3bbe1ef60f02cecacb" ], "index": "pypi", - "version": "==1.25.4" + "version": "==1.26.0" }, "text-unidecode": { "hashes": [ @@ -1074,7 +1105,7 @@ "sha256:5f4f682a004951c1b450bc753c710e9280c5746ce6ffedee253ddbcbf54cf1e4", "sha256:6fee160d6ffcd1b1c68c65f14c829c22832bc401726335ce92c52d395944a6a1" ], - "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", + "index": "pypi", "version": "==4.64.1" }, "trio": { @@ -1110,11 +1141,11 @@ }, "tzdata": { "hashes": [ - "sha256:04a680bdc5b15750c39c12a448885a51134a27ec9af83667663f0b3a1bf3f342", - "sha256:91f11db4503385928c15598c98573e3af07e7229181bee5375bd30f1695ddcae" + "sha256:2b88858b0e3120792a3c0635c23daf36a7d7eeeca657c323da299d2094402a0d", + "sha256:fe5f866eddd8b96e9fcba978f8e503c909b19ea7efda11e52e39494bad3a7bfa" ], "markers": "python_version >= '3.6'", - "version": "==2022.6" + "version": "==2022.7" }, "tzlocal": { "hashes": [ @@ -1149,11 +1180,11 @@ }, "vk-url-scraper": { "hashes": [ - "sha256:3718a569e431c9c2bc7e92e9156e25b7112dc0b9b461c8001fa481a00ccbd3bc", - "sha256:baebe32bb29d6f188d849f38ecc43d04d5b5bad05db7f31dfdbe450f684042f0" + "sha256:1cd6daad89a1f920902cb68c5952c5ab5e80ba2bf4a8c3657c781b5b0f9d406b", + "sha256:d430de947575e321cedceecfdf198b8bd14db3026038b924547e8b1c7c6a09ed" ], "index": "pypi", - "version": "==0.3.8" + "version": "==0.3.10" }, "websockets": { "hashes": [ @@ -1266,11 +1297,11 @@ }, "pycodestyle": { "hashes": [ - "sha256:2c9607871d58c76354b697b42f5d57e1ada7d261c261efac224b664affdc5785", - "sha256:d1735fc58b418fd7c5f658d28d943854f8a849b01a5d0a1e6f3f3fdd0166804b" + "sha256:347187bdb476329d98f695c213d7295a846d1152ff4fe9bacb8a9590b8ee7053", + "sha256:8a4eaf0d0495c7395bdab3589ac2db602797d76207242c17d470186815706610" ], "markers": "python_version >= '3.6'", - "version": "==2.9.1" + "version": "==2.10.0" }, "tomli": { "hashes": [ diff --git a/README.md b/README.md index 11ff002..7edc3fd 100644 --- a/README.md +++ b/README.md @@ -22,6 +22,12 @@ Use this to make sure you help making sure you did all the required steps: * [ ] (optional for instagram) `instaloader.session` file which appears after the 1st run and login in telegram * [ ] (optional for browsertrix) `profile.tar.gz` file +### Private telegram channels +* Cannot use bot token +* Should have one with bot token, one without +* Setup join all private invite links at the start +* + ## Setup ### Always required 1. [A Google Service account is necessary for use with `gspread`.](https://gspread.readthedocs.io/en/latest/oauth2.html#for-bots-using-service-account) Credentials for this account should be stored in `service_account.json`, in the same directory as the script. diff --git a/orchestrate.yaml b/orchestration.example.yaml similarity index 80% rename from orchestrate.yaml rename to orchestration.example.yaml index 9626e83..7163829 100644 --- a/orchestrate.yaml +++ b/orchestration.example.yaml @@ -8,8 +8,8 @@ steps: # - twitter # - instagram # - webarchive # this way it runs as a failsafe only - enrichers: - - screenshot + # enrichers: + # - screenshot # - wacz # - webarchive # this way it runs for every case, webarchive extends archiver and enrichment # - thumbnails @@ -29,11 +29,11 @@ configurations: global: - save_logs: False gsheets_feeder: - sheet: auto-archiver-test + sheet: my-auto-archiver header: 2 # defaults to 1 in GSheetsFeeder service_account: "secrets/service_account.json" - allow_worksheets: "aa-refactor-tests" - block_worksheets: "blocked,test-cases-008" + # allow_worksheets: "allowed" + # block_worksheets: "blocked1,blocked2" columns: 'url': 'link' 'status': 'archive status' @@ -49,6 +49,16 @@ configurations: 'hash': 'hash' 'wacz': 'wacz' 'replaywebpage': 'replaywebpage' + telethon: + api_id: "1234567" + api_hash: "examplehash" + session_file: "secrets/anon" + channel_invites: + - invite: https://t.me/+XXXXXXXXXXXXXX + id: 1000000000 + - invite: https://t.me/joinchat/XXXXXXXXXXXXXX + id: 1000000001 + tiktok: api_keys: - username: 1 @@ -60,7 +70,7 @@ configurations: token: "here" screenshot: width: 1280 - height: 720000 + height: 4600 wacz: profile: secrets/profile.tar.gz webarchive: diff --git a/src/archivers/__init__.py b/src/archivers/__init__.py index 7f51e39..a2cb67c 100644 --- a/src/archivers/__init__.py +++ b/src/archivers/__init__.py @@ -1,5 +1,6 @@ # we need to explicitly expose the available imports here from .base_archiver import Archiver, ArchiveResult +from .archiver import Archiverv2 from .telegram_archiver import TelegramArchiver from .telethon_archiver import TelethonArchiver from .tiktok_archiver import TiktokArchiver @@ -8,4 +9,6 @@ from .youtubedl_archiver import YoutubeDLArchiver from .twitter_archiver import TwitterArchiver from .vk_archiver import VkArchiver from .twitter_api_archiver import TwitterApiArchiver -from .instagram_archiver import InstagramArchiver \ No newline at end of file +from .instagram_archiver import InstagramArchiver + +from .telethon_archiverv2 import TelethonArchiver \ No newline at end of file diff --git a/src/archivers/archiver.py b/src/archivers/archiver.py new file mode 100644 index 0000000..538804e --- /dev/null +++ b/src/archivers/archiver.py @@ -0,0 +1,26 @@ +from __future__ import annotations +from abc import abstractmethod +from dataclasses import dataclass +from metadata import Metadata +from steps.step import Step + + +@dataclass +class Archiverv2(Step): + name = "archiver" + + def __init__(self, config: dict) -> None: + # without this STEP.__init__ is not called + super().__init__(config) + # self.setup() + + # only for typing... + def init(name: str, config: dict) -> Archiverv2: + return Step.init(name, config, Archiverv2) + + def setup(self) -> None: + # used when archivers need to login or do other one-time setup + pass + + @abstractmethod + def download(self, item: Metadata) -> Metadata: pass diff --git a/src/archivers/telethon_archiverv2.py b/src/archivers/telethon_archiverv2.py new file mode 100644 index 0000000..a4273b0 --- /dev/null +++ b/src/archivers/telethon_archiverv2.py @@ -0,0 +1,114 @@ +from archivers import Archiverv2 +from metadata import Metadata +from telethon.sync import TelegramClient +from telethon.errors import ChannelInvalidError +from telethon.tl.types import PeerUser, PeerChat, PeerChannel +from telethon.tl.functions.messages import ImportChatInviteRequest +from telethon.errors.rpcerrorlist import UserAlreadyParticipantError, FloodWaitError, InviteRequestSentError, InviteHashExpiredError +from loguru import logger +from tqdm import tqdm +import re, time, json + + + +class TelethonArchiver(Archiverv2): + name = "telethon" + link_pattern = re.compile(r"https:\/\/t\.me(\/c){0,1}\/(.+)\/(\d+)") + invite_pattern = re.compile(r"t.me(\/joinchat){0,1}\/\+?(.+)") + + def __init__(self, config: dict) -> None: + super().__init__(config) + assert self.api_id is not None and type(self.api_id) == str and len(self.api_id) > 0, f"invalid telethon api_id value ({self.api_id}) should be a valid string" + assert self.api_hash is not None and type(self.api_hash) == str and len(self.api_hash) > 0, f"invalid telethon api_hash value ({self.api_hash}) should be a valid string" + + self.client = TelegramClient(self.session_file, self.api_id, self.api_hash) + + @staticmethod + def configs() -> dict: + return { + "api_id": {"default": None, "help": "telegram API_ID value, go to https://my.telegram.org/apps"}, + "api_hash": {"default": None, "help": "telegram API_HASH value, go to https://my.telegram.org/apps"}, + # "bot_token": {"default": None, "help": "optional, but allows access to more content such as large videos, talk to @botfather"}, + "session_file": {"default": "secrets/anon", "help": "optional, records the telegram login session for future usage"}, + "channel_invites": { + "default": {}, + "help": "(JSON string) private channel invite links (format: t.me/joinchat/HASH OR t.me/+HASH) and (optional but important to avoid hanging for minutes on startup) channel id (format: CHANNEL_ID taken from a post url like https://t.me/c/CHANNEL_ID/1), the telegram account will join any new channels on setup", + "cli_set": lambda cli_val, cur_val: dict(cur_val, **json.loads(cli_val)) + } + } + + def setup(self) -> None: + logger.info(f"SETUP {self.name} checking login...") + with self.client.start(): pass + + if len(self.channel_invites): + logger.info(f"SETUP {self.name} joining channels...") + with self.client.start(): + # get currently joined channels + # https://docs.telethon.dev/en/stable/modules/custom.html#module-telethon.tl.custom.dialog + joined_channel_ids = [c.id for c in self.client.get_dialogs() if c.is_channel] + logger.info(f"already part of {len(joined_channel_ids)} channels") + + i = 0 + pbar = tqdm(desc=f"joining {len(self.channel_invites)} invite links", total=len(self.channel_invites)) + while i < len(self.channel_invites): + channel_invite = self.channel_invites[i] + channel_id = channel_invite.get("id", False) + invite = channel_invite["invite"] + if (match := self.invite_pattern.search(invite)): + try: + if channel_id: + ent = self.client.get_entity(int(channel_id)) # fails if not a member + else: + ent = self.client.get_entity(invite) # fails if not a member + logger.warning(f"please add the property id='{ent.id}' to the 'channel_invites' configuration where {invite=}, not doing so can lead to a minutes-long setup time due to telegram's rate limiting.") + except ValueError as e: + logger.info(f"joining new channel {invite=}") + try: + self.client(ImportChatInviteRequest(match.group(2))) + except UserAlreadyParticipantError as e: + logger.info(f"already joined {invite=}") + except InviteRequestSentError: + logger.warning(f"already sent a join request with {invite} still no answer") + except InviteHashExpiredError: + logger.warning(f"{invite=} has expired please find a more recent one") + except Exception as e: + logger.error(f"could not join channel with {invite=} due to {e}") + except FloodWaitError as e: + logger.warning(f"got a flood error, need to wait {e.seconds} seconds") + time.sleep(e.seconds) + continue + else: + logger.warning(f"Invalid invite link {invite}") + i+=1 + pbar.update() + + + def download(self, item: Metadata) -> Metadata: + url = self.get_url(item) + print(f"downloading {url=}") + # detect URLs that we definitely cannot handle + match = self.link_pattern.search(url) + if not match: return False + + # app will ask (stall for user input!) for phone number and auth code if anon.session not found + # TODO: not using bot_token since then private channels cannot be archived + # with self.client.start(bot_token=self.bot_token): + with self.client.start(): + # self.client(ImportChatInviteRequest('4kAkN49IKJBhZDk6')) + is_private = match.group(1) == "/c" + print(f"{is_private=}") + chat = int(match.group(2)) if is_private else match.group(2) + post_id = int(match.group(3)) + + try: + post = self.client.get_messages(chat, ids=post_id) + except ValueError as e: + logger.error(f"Could not fetch telegram {url} possibly it's private: {e}") + return False + except ChannelInvalidError as e: + logger.error(f"Could not fetch telegram {url}. This error can be fixed if you setup a bot_token in addition to api_id and api_hash: {e}") + return False + + if post is None: return False + print(post) diff --git a/src/configs/v2config.py b/src/configs/v2config.py index 50c8b0f..75a125e 100644 --- a/src/configs/v2config.py +++ b/src/configs/v2config.py @@ -3,9 +3,9 @@ import argparse, yaml from dataclasses import dataclass, field from typing import List -from feeders.feeder import Feeder +from archivers import Archiverv2 +from feeders import Feeder from steps.step import Step -from utils import Util from enrichers import Enricher from collections import defaultdict @@ -16,10 +16,11 @@ class ConfigV2: configurable_parents = [ Feeder, Enricher, + Archiverv2, # Util ] feeder: Step # TODO:= BaseFeeder - archivers: List[Step] = field(default_factory=[]) # TODO: fix type + archivers: List[Archiverv2] = field(default_factory=[]) # TODO: fix type enrichers: List[Enricher] = field(default_factory=[]) formatters: List[Step] = field(default_factory=[]) # TODO: fix type storages: List[Step] = field(default_factory=[]) # TODO: fix type @@ -48,7 +49,7 @@ class ConfigV2: assert "." not in child.name, f"class prop name cannot contain dots('.'): {child.name}" assert "." not in config, f"config property cannot contain dots('.'): {config}" config_path = f"{child.name}.{config}" - parser.add_argument(f'--{config_path}', action='store', dest=config_path, help=details['help']) + parser.add_argument(f'--{config_path}', action='store', dest=config_path, help=f"{details['help']} (defaults to {details['default']})") self.defaults[config_path] = details["default"] if "cli_set" in details: self.cli_ops[config_path] = details["cli_set"] @@ -82,9 +83,11 @@ class ConfigV2: self.feeder = Feeder.init(steps.get("feeder", "cli_feeder"), self.config) self.enrichers = [Enricher.init(e, self.config) for e in steps.get("enrichers", [])] + self.archivers = [Archiverv2.init(e, self.config) for e in steps.get("archivers", [])] print("feeder", self.feeder) print("enrichers", [e for e in self.enrichers]) + print("archivers", [e for e in self.archivers]) def validate(self): pass diff --git a/src/enrichers/enricher_screenshot.py b/src/enrichers/enricher_screenshot.py index 04a2bf0..5018859 100644 --- a/src/enrichers/enricher_screenshot.py +++ b/src/enrichers/enricher_screenshot.py @@ -1,6 +1,9 @@ +from utils import Webdriver from . import Enricher from metadata import Metadata from loguru import logger +from selenium.common.exceptions import TimeoutException +import time class ScreenshotEnricher(Enricher): @@ -11,43 +14,19 @@ class ScreenshotEnricher(Enricher): return { "width": {"default": 1280, "help": "width of the screenshots"}, "height": {"default": 720, "help": "height of the screenshots"}, + "timeout": {"default": 60, "help": "timeout for taking the screenshot"} } def enrich(self, item: Metadata) -> Metadata: url = self.get_url(item) - print("enrich") - # driver = config.webdriver - # with driver as Webdriver(): # TODO: make a util - # #TODO: take screenshot - # pass + print(f"enriching {url=}") + with Webdriver(self.width, self.height, self.timeout, 'facebook.com' in url) as driver: # TODO: make a util + try: + driver.get(url) + time.sleep(2) + except TimeoutException: + logger.info("TimeoutException loading page for screenshot") - # logger.debug(f"getting screenshot for {url=}") - # key = self._get_key_from_url(url, ".png", append_datetime=True) - # filename = os.path.join(Storage.TMP_FOLDER, key) - - # # Accept cookies popup dismiss for ytdlp video - # if 'facebook.com' in url: - # try: - # logger.debug(f'Trying fb click accept cookie popup for {url}') - # self.driver.get("http://www.facebook.com") - # foo = self.driver.find_element(By.XPATH, "//button[@data-cookiebanner='accept_only_essential_button']") - # foo.click() - # logger.debug(f'fb click worked') - # # linux server needs a sleep otherwise facebook cookie won't have worked and we'll get a popup on next page - # time.sleep(2) - # except: - # logger.warning(f'Failed on fb accept cookies for url {url}') - - # try: - # self.driver.get(url) - # time.sleep(6) - # except TimeoutException: - # logger.info("TimeoutException loading page for screenshot") - - # self.driver.save_screenshot(filename) - # self.storage.upload(filename, key, extra_args={'ACL': 'public-read', 'ContentType': 'image/png'}) - - # cdn_url = self.storage.get_cdn_url(key) - # self.add_to_media(cdn_url, key) - - # return cdn_url + #TODO: return saved object + driver.save_screenshot("TODO-HASH_OR_UUID.png") + return None diff --git a/src/feeders/feeder_gsheet.py b/src/feeders/feeder_gsheet.py index a99376f..ad28af1 100644 --- a/src/feeders/feeder_gsheet.py +++ b/src/feeders/feeder_gsheet.py @@ -4,7 +4,7 @@ import json, gspread from loguru import logger # from . import Enricher -from feeders.feeder import Feeder +from feeders import Feeder from steps.gsheet import Gsheets from utils import GWorksheet @@ -30,7 +30,7 @@ class GsheetsFeeder(Gsheets, Feeder): }, "block_worksheets": { "default": set(), - "help": "(CSV) explicitly block some worksheets from being processed, defaults to empty", + "help": "(CSV) explicitly block some worksheets from being processed", "cli_set": lambda cli_val, cur_val: set(cli_val.split(",")) } }) diff --git a/src/metadata.py b/src/metadata.py index 39b62ff..d56fcd9 100644 --- a/src/metadata.py +++ b/src/metadata.py @@ -1,6 +1,6 @@ from __future__ import annotations -from typing import Union, Dict +from typing import Any, Union, Dict from dataclasses import dataclass @@ -12,18 +12,28 @@ class Metadata: # title: str # url: str # hash: str - metadata: Dict[str, Metadata] + metadata: Dict[str, Any] - @staticmethod - def merge(left: Metadata, right: Metadata, overwrite_left=True) -> Metadata: + # TODO: remove and use default? + def __init__(self) -> None: + self.status = "" + self.metadata = {} + + # @staticmethod + def merge(self: Metadata, right: Metadata, overwrite_left=True) -> Metadata: # should return a merged version of the Metadata # will work for archived() and enriched() # what if 2 metadatas contain the same keys? only one can remain! : overwrite_left pass - def get(self, key: str) -> Union[Metadata, str]: + # TODO: setters? + def set(self, key: str, val: Any) -> Union[Metadata, str]: # goes through metadata and returns the Metadata available - pass + self.metadata[key] = val + + def get(self, key: str, default: Any = None) -> Union[Metadata, str]: + # goes through metadata and returns the Metadata available + return self.metadata.get(key, default) def as_json(self) -> str: # converts all metadata and data into JSON diff --git a/src/orchestrator.py b/src/orchestrator.py index f32f4c9..5889497 100644 --- a/src/orchestrator.py +++ b/src/orchestrator.py @@ -1,8 +1,11 @@ from __future__ import annotations +from ast import List from typing import Union, Dict from dataclasses import dataclass +from archivers.archiver import Archiverv2 from enrichers.enricher import Enricher +from metadata import Metadata """ how not to couple the different pieces of logic @@ -108,12 +111,13 @@ Once an archiver returns a link to a local file (for eg to a storage), how do we The context metadata should include a temporary folder (maybe a LocalStorage instance?) """ + class ArchivingOrchestrator: def __init__(self, config) -> None: # in config.py we should test that the archivers exist and log mismatches (blocking execution) # identify each formatter, storage, database, etc # self.feeder = Feeder.init(config.feeder, config.get(config.feeder)) - + # Is it possible to overwrite config.yaml values? it could be useful: share config file and modify gsheets_feeder.sheet via CLI # where does that update/processing happen? in config.py # reflection for Archiver to know wihch child classes it has? use Archiver.__subclasses__ @@ -123,7 +127,13 @@ class ArchivingOrchestrator: # ] self.feeder = config.feeder self.enrichers = config.enrichers + self.archivers: List[Archiverv2] = config.archivers + for a in self.archivers: a.setup() + + self.formatters = [] + self.storages = [] + self.databases = [] # self.formatters = [ # Formatter.init(f, config) # for f in config.formatters @@ -145,30 +155,33 @@ class ArchivingOrchestrator: def feed(self) -> list(ArchiveResult): for url in self.feeder: print("ARCHIVING", url) - # self.archive(url) + self.archive(url) # how does this handle the parameters like folder which can be different for each archiver? # the storage needs to know where to archive!! - # solution: feeders have context: extra metadata that they can read or ignore, + # solution: feeders have context: extra metadata that they can read or ignore, # all of it should have sensible defaults (eg: folder) # default feeder is a list with 1 element def archive(self, url) -> Union[ArchiveResult, None]: - url = clear_url(url) - result = Metadata(url=url) - + # TODO: + # url = clear_url(url) + # result = Metadata(url=url) + result = Metadata() + result.set("url", url) should_archive = True - for d in databases: should_archive &= d.should_process(url) + for d in self.databases: should_archive &= d.should_process(url) # should storages also be able to check? - for s in storages: should_archive &= s.should_process(url) + for s in self.storages: should_archive &= s.should_process(url) if not should_archive: + print("skipping") return "skipping" # signal to DB that archiving has started - for d in databases: + for d in self.databases: # are the databases to decide whether to archive? - # they can simply return True by default, otherwise they can avoid duplicates. should this logic be more granular, for example on the archiver level: a tweet will not need be scraped twice, whereas an instagram profile might. the archiver could not decide from the link which parts to archive, + # they can simply return True by default, otherwise they can avoid duplicates. should this logic be more granular, for example on the archiver level: a tweet will not need be scraped twice, whereas an instagram profile might. the archiver could not decide from the link which parts to archive, # instagram profile example: it would always re-archive everything # maybe the database/storage could use a hash/key to decide if there's a need to re-archive if d.should_process(url): @@ -180,15 +193,15 @@ class ArchivingOrchestrator: return # vk, telethon, ... - for a in archivers: + for a in self.archivers: # with automatic try/catch in download + archived (+ the other ops below) - # should the archivers come with the config already? are there configs which change at runtime? + # should the archivers come with the config already? are there configs which change at runtime? # think not, so no need to pass config as parameter - # do they need to be refreshed with every execution? + # do they need to be refreshed with every execution? # this is where the Hashes come from, the place with access to all content # the archiver does not have access to storage - result.update(a.download(url)) - if result.is_success(): break + result.merge(a.download(result)) + if True or result.is_success(): break # what if an archiver returns multiple entries and one is to be part of HTMLgenerator? # should it call the HTMLgenerator as if it's not an enrichment? @@ -196,20 +209,20 @@ class ArchivingOrchestrator: # then how to execute it last? should there also be post-processors? are there other examples? # maybe as a PDF? or a Markdown file # side captures: screenshot, wacz, webarchive, thumbnails, HTMLgenerator - for e in enrichers: - result.update(e.enrich(result)) + for e in self.enrichers: + result.merge(e.enrich(result)) # formatters, enrichers, and storages will sometimes look for specific properties: eg
  • Screenshot:
  • - for p in formatter: - result.update(p.process(result)) + for f in self.formatters: + result.merge(f.format(result)) # storages - for s in storages: + for s in self.storages: for m in result.media: - m.update(s.store(m)) + m.merge(s.store(m)) # signal completion to databases (DBs, Google Sheets, CSV, ...) # a hash registration service could be one database: forensic archiving - for d in databases: d.done( result) + for d in self.databases: d.done(result) - return result \ No newline at end of file + return result diff --git a/src/steps/gsheet.py b/src/steps/gsheet.py index 9654da4..279c036 100644 --- a/src/steps/gsheet.py +++ b/src/steps/gsheet.py @@ -36,7 +36,7 @@ class Gsheets(Step): 'wacz': 'wacz', 'replaywebpage': 'replaywebpage', }, - "help": "names of columns in the google sheet", + "help": "names of columns in the google sheet (stringified JSON object)", "cli_set": lambda cli_val, cur_val: dict(cur_val, **json.loads(cli_val)) }, } \ No newline at end of file diff --git a/src/utils/__init__.py b/src/utils/__init__.py index ad56f36..9aff525 100644 --- a/src/utils/__init__.py +++ b/src/utils/__init__.py @@ -1,4 +1,5 @@ # we need to explicitly expose the available imports here from .gworksheet import GWorksheet from .misc import * -from .util import Util \ No newline at end of file +from .util import Util +from .webdriver import Webdriver \ No newline at end of file diff --git a/src/utils/util.py b/src/utils/util.py index 714d499..e465bda 100644 --- a/src/utils/util.py +++ b/src/utils/util.py @@ -1,11 +1,12 @@ from __future__ import annotations +from abc import abstractmethod from dataclasses import dataclass -from abc import abstractmethod, ABC from metadata import Metadata from steps.step import Step +#TODO: likely unused @dataclass -class Util(Step, ABC): +class Util(Step): name = "util" def __init__(self, config: dict) -> None: diff --git a/src/utils/webdriver.py b/src/utils/webdriver.py new file mode 100644 index 0000000..5ce0374 --- /dev/null +++ b/src/utils/webdriver.py @@ -0,0 +1,45 @@ +from __future__ import annotations +from selenium import webdriver +from selenium.common.exceptions import TimeoutException +from loguru import logger +from selenium.webdriver.common.by import By +import time + + +class Webdriver: + def __init__(self, width: int, height: int, timeout_seconds: int, facebook_accept_cookies: bool = False) -> webdriver: + self.width = width + self.height = height + self.timeout_seconds = timeout_seconds + self.facebook_accept_cookies = facebook_accept_cookies + + def __enter__(self) -> webdriver: + options = webdriver.FirefoxOptions() + options.headless = True + options.set_preference('network.protocol-handler.external.tg', False) + try: + self.driver = webdriver.Firefox(options=options) + self.driver.set_window_size(self.width, self.height) + self.driver.set_page_load_timeout(self.timeout_seconds) + except TimeoutException as e: + logger.error(f"failed to get new webdriver, possibly due to insufficient system resources or timeout settings: {e}") + + if self.facebook_accept_cookies: + try: + logger.debug(f'Trying fb click accept cookie popup.') + self.driver.get("http://www.facebook.com") + foo = self.driver.find_element(By.XPATH, "//button[@data-cookiebanner='accept_only_essential_button']") + foo.click() + logger.debug(f'fb click worked') + # linux server needs a sleep otherwise facebook cookie won't have worked and we'll get a popup on next page + time.sleep(2) + except: + logger.warning(f'Failed on fb accept cookies.') + + return self.driver + + def __exit__(self, exc_type, exc_val, exc_tb): + self.driver.close() + self.driver.quit() + del self.driver + return True From 53ffa2d4aee2e86b181eb8e2f19ff1fc33ac3456 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Wed, 14 Dec 2022 15:37:34 +0000 Subject: [PATCH 026/190] telethon_archiver working for multiple media --- src/archivers/archiver.py | 21 ++++++- src/archivers/telethon_archiverv2.py | 86 ++++++++++++++++++++++------ src/metadata.py | 32 ++++++++++- src/orchestrator.py | 12 +++- src/steps/step.py | 5 -- 5 files changed, 125 insertions(+), 31 deletions(-) diff --git a/src/archivers/archiver.py b/src/archivers/archiver.py index 538804e..37f5d4d 100644 --- a/src/archivers/archiver.py +++ b/src/archivers/archiver.py @@ -3,6 +3,7 @@ from abc import abstractmethod from dataclasses import dataclass from metadata import Metadata from steps.step import Step +import mimetypes, requests @dataclass @@ -12,9 +13,9 @@ class Archiverv2(Step): def __init__(self, config: dict) -> None: # without this STEP.__init__ is not called super().__init__(config) - # self.setup() # only for typing... + def init(name: str, config: dict) -> Archiverv2: return Step.init(name, config, Archiverv2) @@ -22,5 +23,23 @@ class Archiverv2(Step): # used when archivers need to login or do other one-time setup pass + def _guess_file_type(self, path: str) -> str: + """ + Receives a URL or filename and returns global mimetype like 'image' or 'video' + see https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/MIME_types/Common_types + """ + mime = mimetypes.guess_type(path)[0] + if mime is not None: + return mime.split("/")[0] + return "" + + def download_from_url(self, url:str, to_filename:str) -> None: + headers = { + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36' + } + d = requests.get(url, headers=headers) + with open(to_filename, 'wb') as f: + f.write(d.content) + @abstractmethod def download(self, item: Metadata) -> Metadata: pass diff --git a/src/archivers/telethon_archiverv2.py b/src/archivers/telethon_archiverv2.py index a4273b0..267dc2d 100644 --- a/src/archivers/telethon_archiverv2.py +++ b/src/archivers/telethon_archiverv2.py @@ -7,8 +7,7 @@ from telethon.tl.functions.messages import ImportChatInviteRequest from telethon.errors.rpcerrorlist import UserAlreadyParticipantError, FloodWaitError, InviteRequestSentError, InviteHashExpiredError from loguru import logger from tqdm import tqdm -import re, time, json - +import re, time, json, os class TelethonArchiver(Archiverv2): @@ -38,6 +37,10 @@ class TelethonArchiver(Archiverv2): } def setup(self) -> None: + """ + 1. trigger login process for telegram or proceed if already saved in a session file + 2. joins channel_invites where needed + """ logger.info(f"SETUP {self.name} checking login...") with self.client.start(): pass @@ -56,11 +59,11 @@ class TelethonArchiver(Archiverv2): channel_id = channel_invite.get("id", False) invite = channel_invite["invite"] if (match := self.invite_pattern.search(invite)): - try: + try: if channel_id: - ent = self.client.get_entity(int(channel_id)) # fails if not a member + ent = self.client.get_entity(int(channel_id)) # fails if not a member else: - ent = self.client.get_entity(invite) # fails if not a member + ent = self.client.get_entity(invite) # fails if not a member logger.warning(f"please add the property id='{ent.id}' to the 'channel_invites' configuration where {invite=}, not doing so can lead to a minutes-long setup time due to telegram's rate limiting.") except ValueError as e: logger.info(f"joining new channel {invite=}") @@ -80,35 +83,80 @@ class TelethonArchiver(Archiverv2): continue else: logger.warning(f"Invalid invite link {invite}") - i+=1 + i += 1 pbar.update() - def download(self, item: Metadata) -> Metadata: - url = self.get_url(item) + url = item.get_url() + print(f"downloading {url=}") # detect URLs that we definitely cannot handle match = self.link_pattern.search(url) if not match: return False - # app will ask (stall for user input!) for phone number and auth code if anon.session not found - # TODO: not using bot_token since then private channels cannot be archived - # with self.client.start(bot_token=self.bot_token): - with self.client.start(): - # self.client(ImportChatInviteRequest('4kAkN49IKJBhZDk6')) - is_private = match.group(1) == "/c" - print(f"{is_private=}") - chat = int(match.group(2)) if is_private else match.group(2) - post_id = int(match.group(3)) + is_private = match.group(1) == "/c" + chat = int(match.group(2)) if is_private else match.group(2) + post_id = int(match.group(3)) + result = Metadata() + + # NB: not using bot_token since then private channels cannot be archived: self.client.start(bot_token=self.bot_token) + with self.client.start(): try: post = self.client.get_messages(chat, ids=post_id) except ValueError as e: logger.error(f"Could not fetch telegram {url} possibly it's private: {e}") return False except ChannelInvalidError as e: - logger.error(f"Could not fetch telegram {url}. This error can be fixed if you setup a bot_token in addition to api_id and api_hash: {e}") + logger.error(f"Could not fetch telegram {url}. This error may be fixed if you setup a bot_token in addition to api_id and api_hash (but then private channels will not be archived, we need to update this logic to handle both): {e}") return False if post is None: return False - print(post) + logger.info(f"fetched telegram {post.id=}") + + media_posts = self._get_media_posts_in_group(chat, post) + logger.debug(f'got {len(media_posts)=} for {url=}') + + tmp_dir = item.get("tmp_dir") + + group_id = post.grouped_id if post.grouped_id is not None else post.id + title = post.message + for mp in media_posts: + if len(mp.message) > len(title): title = mp.message # save the longest text found (usually only 1) + + # media can also be in entities + if mp.entities: + other_media_urls = [e.url for e in mp.entities if hasattr(e, "url") and e.url and self._guess_file_type(e.url) in ["video", "image"]] + logger.debug(f"Got {len(other_media_urls)} other medial urls from {mp.id=}: {other_media_urls}") + for om_url in other_media_urls: + filename = os.path.join(tmp_dir, f'{chat}_{group_id}_{self._get_key_from_url(om_url)}') + self.download_from_url(om_url, filename) + result.add_media(filename) + + filename_dest = os.path.join(tmp_dir, f'{chat}_{group_id}', str(mp.id)) + filename = self.client.download_media(mp.media, filename_dest) + if not filename: + logger.debug(f"Empty media found, skipping {str(mp)=}") + continue + result.add_media(filename) + + result.set("post", post).set_title(title).set_timestamp(post.date) + return result + + def _get_media_posts_in_group(self, chat, original_post, max_amp=10): + """ + Searches for Telegram posts that are part of the same group of uploads + The search is conducted around the id of the original post with an amplitude + of `max_amp` both ways + Returns a list of [post] where each post has media and is in the same grouped_id + """ + if getattr(original_post, "grouped_id", None) is None: + return [original_post] if getattr(original_post, "media", False) else [] + + search_ids = [i for i in range(original_post.id - max_amp, original_post.id + max_amp + 1)] + posts = self.client.get_messages(chat, ids=search_ids) + media = [] + for post in posts: + if post is not None and post.grouped_id == original_post.grouped_id and post.media is not None: + media.append(post) + return media diff --git a/src/metadata.py b/src/metadata.py index d56fcd9..e1e8d8b 100644 --- a/src/metadata.py +++ b/src/metadata.py @@ -1,7 +1,9 @@ from __future__ import annotations +from ast import List from typing import Any, Union, Dict from dataclasses import dataclass +from datetime import datetime @dataclass @@ -15,8 +17,8 @@ class Metadata: metadata: Dict[str, Any] # TODO: remove and use default? - def __init__(self) -> None: - self.status = "" + def __init__(self, status="") -> None: + self.status = status self.metadata = {} # @staticmethod @@ -27,14 +29,38 @@ class Metadata: pass # TODO: setters? - def set(self, key: str, val: Any) -> Union[Metadata, str]: + def set(self, key: str, val: Any) -> Metadata: # goes through metadata and returns the Metadata available self.metadata[key] = val + return self def get(self, key: str, default: Any = None) -> Union[Metadata, str]: # goes through metadata and returns the Metadata available return self.metadata.get(key, default) +# custom getter/setters + + def set_url(self, url: str) -> Metadata: + assert type(url) is str and len(url) > 0, "invalid URL" + return self.set("url", url) + + def get_url(self) -> str: + url = self.get("url") + assert type(url) is str and len(url) > 0, "invalid URL" + return url + + def get_media(self) -> List: + return self.get("media", []) + + def set_title(self, title: str) -> Metadata: + return self.set("title", title) + + def set_timestamp(self, title: datetime) -> Metadata: + return self.set("title", title) + + def add_media(self, filename: str) -> Metadata: + return self.get_media().append(filename) + def as_json(self) -> str: # converts all metadata and data into JSON pass diff --git a/src/orchestrator.py b/src/orchestrator.py index 5889497..804948e 100644 --- a/src/orchestrator.py +++ b/src/orchestrator.py @@ -6,6 +6,7 @@ from archivers.archiver import Archiverv2 from enrichers.enricher import Enricher from metadata import Metadata +import tempfile, time """ how not to couple the different pieces of logic @@ -155,19 +156,24 @@ class ArchivingOrchestrator: def feed(self) -> list(ArchiveResult): for url in self.feeder: print("ARCHIVING", url) - self.archive(url) + with tempfile.TemporaryDirectory(dir="./") as tmp_dir: + self.archive(url, tmp_dir) + + print("holding on") + time.sleep(300) # how does this handle the parameters like folder which can be different for each archiver? # the storage needs to know where to archive!! # solution: feeders have context: extra metadata that they can read or ignore, # all of it should have sensible defaults (eg: folder) # default feeder is a list with 1 element - def archive(self, url) -> Union[ArchiveResult, None]: + def archive(self, url: str, tmp_dir: str) -> Union[Metadata, None]: # TODO: # url = clear_url(url) # result = Metadata(url=url) result = Metadata() - result.set("url", url) + result.set_url(url) + result.set("tmp_dir", tmp_dir) should_archive = True for d in self.databases: should_archive &= d.should_process(url) diff --git a/src/steps/step.py b/src/steps/step.py index 04d7a61..4d7e6c1 100644 --- a/src/steps/step.py +++ b/src/steps/step.py @@ -29,8 +29,3 @@ class Step(ABC): print(sub.name, "CALLING NEW") return sub(config) raise ClassFoundException(f"Unable to initialize STEP with {name=}") - - def get_url(self, item: Metadata) -> str: - url = item.get("url") - assert type(url) is str and len(url) > 0 - return url From 9c056d001c3cf5e0d26156e07ad62fe7d1bbecc3 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Wed, 14 Dec 2022 16:11:06 +0000 Subject: [PATCH 027/190] merge logic started --- src/archivers/telethon_archiverv2.py | 13 ++++++---- src/metadata.py | 38 ++++++++++++++++++++++------ src/orchestrator.py | 5 ++-- src/steps/step.py | 4 +-- 4 files changed, 43 insertions(+), 17 deletions(-) diff --git a/src/archivers/telethon_archiverv2.py b/src/archivers/telethon_archiverv2.py index 267dc2d..4fa3ce0 100644 --- a/src/archivers/telethon_archiverv2.py +++ b/src/archivers/telethon_archiverv2.py @@ -87,9 +87,11 @@ class TelethonArchiver(Archiverv2): pbar.update() def download(self, item: Metadata) -> Metadata: + """ + if this url is archivable will download post info and look for other posts from the same group with media. + can handle private/public channels + """ url = item.get_url() - - print(f"downloading {url=}") # detect URLs that we definitely cannot handle match = self.link_pattern.search(url) if not match: return False @@ -126,8 +128,9 @@ class TelethonArchiver(Archiverv2): # media can also be in entities if mp.entities: - other_media_urls = [e.url for e in mp.entities if hasattr(e, "url") and e.url and self._guess_file_type(e.url) in ["video", "image"]] - logger.debug(f"Got {len(other_media_urls)} other medial urls from {mp.id=}: {other_media_urls}") + other_media_urls = [e.url for e in mp.entities if hasattr(e, "url") and e.url and self._guess_file_type(e.url) in ["video", "image", "audio"]] + if len(other_media_urls): + logger.debug(f"Got {len(other_media_urls)} other medial urls from {mp.id=}: {other_media_urls}") for om_url in other_media_urls: filename = os.path.join(tmp_dir, f'{chat}_{group_id}_{self._get_key_from_url(om_url)}') self.download_from_url(om_url, filename) @@ -140,7 +143,7 @@ class TelethonArchiver(Archiverv2): continue result.add_media(filename) - result.set("post", post).set_title(title).set_timestamp(post.date) + result.set("post", str(post)).set_title(title).set_timestamp(post.date) return result def _get_media_posts_in_group(self, chat, original_post, max_amp=10): diff --git a/src/metadata.py b/src/metadata.py index e1e8d8b..193003f 100644 --- a/src/metadata.py +++ b/src/metadata.py @@ -4,6 +4,7 @@ from ast import List from typing import Any, Union, Dict from dataclasses import dataclass from datetime import datetime +import json @dataclass @@ -21,12 +22,25 @@ class Metadata: self.status = status self.metadata = {} - # @staticmethod def merge(self: Metadata, right: Metadata, overwrite_left=True) -> Metadata: - # should return a merged version of the Metadata - # will work for archived() and enriched() - # what if 2 metadatas contain the same keys? only one can remain! : overwrite_left - pass + """ + merges to Metadata instances, will overwrite according to overwrite_left flag + """ + res = Metadata() + if overwrite_left: + res.status = right.status + res.metadata = dict(self.metadata) # make a copy + for k, v in right.metadata.items(): + print(type(v), type(self.get(k))) + # assert type(v) == type(self.get(k)) + if type(v) not in [dict, list, set] or k not in res.metadata: + res.set(k, v) + else: # key conflict + if type(v) in [dict, set]: res.set(k, self.get(k) | v) + elif type(v) == list: res.set(k, self.get(k) + v) + else: # invert and do same logic + return right.merge(self) + return res # TODO: setters? def set(self, key: str, val: Any) -> Metadata: @@ -34,8 +48,10 @@ class Metadata: self.metadata[key] = val return self - def get(self, key: str, default: Any = None) -> Union[Metadata, str]: + def get(self, key: str, default: Any = None, create_if_missing=False) -> Union[Metadata, str]: # goes through metadata and returns the Metadata available + if create_if_missing and key not in self.metadata: + self.metadata[key] = default return self.metadata.get(key, default) # custom getter/setters @@ -50,7 +66,11 @@ class Metadata: return url def get_media(self) -> List: - return self.get("media", []) + return self.get("media", [], create_if_missing=True) + + def set_content(self, content: str) -> Metadata: + # the main textual content/information from a social media post, webpage, ... + return self.set("content", content) def set_title(self, title: str) -> Metadata: return self.set("title", title) @@ -59,8 +79,10 @@ class Metadata: return self.set("title", title) def add_media(self, filename: str) -> Metadata: + # print(f"adding {filename} to {self.metadata.get('media')}") + # return self.set("media", self.get_media() + [filename]) return self.get_media().append(filename) def as_json(self) -> str: # converts all metadata and data into JSON - pass + return json.dumps(self.metadata) diff --git a/src/orchestrator.py b/src/orchestrator.py index 804948e..9a523bf 100644 --- a/src/orchestrator.py +++ b/src/orchestrator.py @@ -157,8 +157,9 @@ class ArchivingOrchestrator: for url in self.feeder: print("ARCHIVING", url) with tempfile.TemporaryDirectory(dir="./") as tmp_dir: - self.archive(url, tmp_dir) - + result = self.archive(url, tmp_dir) + print(result) + print(result.as_json()) print("holding on") time.sleep(300) # how does this handle the parameters like folder which can be different for each archiver? diff --git a/src/steps/step.py b/src/steps/step.py index 4d7e6c1..7a2135c 100644 --- a/src/steps/step.py +++ b/src/steps/step.py @@ -22,10 +22,10 @@ class Step(ABC): def init(name: str, config: dict, child: Type[Step]) -> Step: """ - cannot find subclasses of child.subclasses + looks into direct subclasses of child for name and returns such ab object + TODO: cannot find subclasses of child.subclasses """ for sub in child.__subclasses__(): if sub.name == name: - print(sub.name, "CALLING NEW") return sub(config) raise ClassFoundException(f"Unable to initialize STEP with {name=}") From 96845305a3ff265fe2690c91ab577fef659a7992 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Wed, 14 Dec 2022 19:01:20 +0000 Subject: [PATCH 028/190] media concept implemented --- src/archivers/telethon_archiverv2.py | 10 ++-- src/media.py | 17 +++++++ src/metadata.py | 63 +++++++++++--------------- src/orchestrator.py | 7 +-- src/storages/__init__.py | 4 +- src/storages/s3.py | 68 ++++++++++++++++++++++++++++ src/storages/storage.py | 21 +++++++++ 7 files changed, 145 insertions(+), 45 deletions(-) create mode 100644 src/media.py create mode 100644 src/storages/s3.py create mode 100644 src/storages/storage.py diff --git a/src/archivers/telethon_archiverv2.py b/src/archivers/telethon_archiverv2.py index 4fa3ce0..ea19c92 100644 --- a/src/archivers/telethon_archiverv2.py +++ b/src/archivers/telethon_archiverv2.py @@ -9,6 +9,8 @@ from loguru import logger from tqdm import tqdm import re, time, json, os +from media import Media + class TelethonArchiver(Archiverv2): name = "telethon" @@ -131,17 +133,17 @@ class TelethonArchiver(Archiverv2): other_media_urls = [e.url for e in mp.entities if hasattr(e, "url") and e.url and self._guess_file_type(e.url) in ["video", "image", "audio"]] if len(other_media_urls): logger.debug(f"Got {len(other_media_urls)} other medial urls from {mp.id=}: {other_media_urls}") - for om_url in other_media_urls: - filename = os.path.join(tmp_dir, f'{chat}_{group_id}_{self._get_key_from_url(om_url)}') + for i, om_url in enumerate(other_media_urls): + filename = os.path.join(tmp_dir, f'{chat}_{group_id}_{i}') self.download_from_url(om_url, filename) - result.add_media(filename) + result.add_media(Media(filename)) filename_dest = os.path.join(tmp_dir, f'{chat}_{group_id}', str(mp.id)) filename = self.client.download_media(mp.media, filename_dest) if not filename: logger.debug(f"Empty media found, skipping {str(mp)=}") continue - result.add_media(filename) + result.add_media(Media(filename)) result.set("post", str(post)).set_title(title).set_timestamp(post.date) return result diff --git a/src/media.py b/src/media.py new file mode 100644 index 0000000..ecee4f4 --- /dev/null +++ b/src/media.py @@ -0,0 +1,17 @@ + +from __future__ import annotations +from ast import List +from typing import Any, Union, Dict +from dataclasses import dataclass +from datetime import datetime +import json + + +@dataclass +class Media: + filename: str + id: str = None + hash: str = None + cdn_url: str = None + hash: str = None + diff --git a/src/metadata.py b/src/metadata.py index 193003f..8945e1a 100644 --- a/src/metadata.py +++ b/src/metadata.py @@ -2,49 +2,38 @@ from __future__ import annotations from ast import List from typing import Any, Union, Dict -from dataclasses import dataclass +from dataclasses import dataclass, field from datetime import datetime -import json +# import json + +from media import Media @dataclass class Metadata: - # does not handle files, only primitives - # the only piece of logic to handle files is the archiver, enricher, and storage - status: str - # title: str - # url: str - # hash: str - metadata: Dict[str, Any] - - # TODO: remove and use default? - def __init__(self, status="") -> None: - self.status = status - self.metadata = {} + status: str = "" + metadata: Dict[str, Any] = field(default_factory=dict) + media: List[Media] = field(default_factory=list) def merge(self: Metadata, right: Metadata, overwrite_left=True) -> Metadata: """ - merges to Metadata instances, will overwrite according to overwrite_left flag + merges two Metadata instances, will overwrite according to overwrite_left flag """ - res = Metadata() if overwrite_left: - res.status = right.status - res.metadata = dict(self.metadata) # make a copy + self.status = right.status for k, v in right.metadata.items(): - print(type(v), type(self.get(k))) - # assert type(v) == type(self.get(k)) - if type(v) not in [dict, list, set] or k not in res.metadata: - res.set(k, v) + assert k not in self.metadata or type(v) == type(self.get(k)) + if type(v) not in [dict, list, set] or k not in self.metadata: + self.set(k, v) else: # key conflict - if type(v) in [dict, set]: res.set(k, self.get(k) | v) - elif type(v) == list: res.set(k, self.get(k) + v) + if type(v) in [dict, set]: self.set(k, self.get(k) | v) + elif type(v) == list: self.set(k, self.get(k) + v) + self.media.extend(right.media) else: # invert and do same logic return right.merge(self) - return res + return self - # TODO: setters? def set(self, key: str, val: Any) -> Metadata: - # goes through metadata and returns the Metadata available self.metadata[key] = val return self @@ -65,9 +54,6 @@ class Metadata: assert type(url) is str and len(url) > 0, "invalid URL" return url - def get_media(self) -> List: - return self.get("media", [], create_if_missing=True) - def set_content(self, content: str) -> Metadata: # the main textual content/information from a social media post, webpage, ... return self.set("content", content) @@ -75,14 +61,17 @@ class Metadata: def set_title(self, title: str) -> Metadata: return self.set("title", title) - def set_timestamp(self, title: datetime) -> Metadata: - return self.set("title", title) + def set_timestamp(self, timestamp: datetime) -> Metadata: + assert type(timestamp) == datetime, "set_timestamp expects a datetime instance" + return self.set("timestamp", timestamp) - def add_media(self, filename: str) -> Metadata: + def add_media(self, media: Media) -> Metadata: # print(f"adding {filename} to {self.metadata.get('media')}") # return self.set("media", self.get_media() + [filename]) - return self.get_media().append(filename) + # return self.get_media().append(media) + return self.media.append(media) - def as_json(self) -> str: - # converts all metadata and data into JSON - return json.dumps(self.metadata) + # def as_json(self) -> str: + # # converts all metadata and data into JSON + # return json.dumps(self.metadata) + # #TODO: datetime is not serializable diff --git a/src/orchestrator.py b/src/orchestrator.py index 9a523bf..2f33370 100644 --- a/src/orchestrator.py +++ b/src/orchestrator.py @@ -153,13 +153,14 @@ class ArchivingOrchestrator: # these rules are checked in config.py # assert len(archivers) > 1, "there needs to be at least one Archiver" - def feed(self) -> list(ArchiveResult): + def feed(self) -> list(Metadata): for url in self.feeder: print("ARCHIVING", url) with tempfile.TemporaryDirectory(dir="./") as tmp_dir: result = self.archive(url, tmp_dir) + print(type(result)) print(result) - print(result.as_json()) + # print(result.as_json()) print("holding on") time.sleep(300) # how does this handle the parameters like folder which can be different for each archiver? @@ -170,7 +171,7 @@ class ArchivingOrchestrator: def archive(self, url: str, tmp_dir: str) -> Union[Metadata, None]: # TODO: - # url = clear_url(url) + # url = clear_url(url) # should we save if they differ? # result = Metadata(url=url) result = Metadata() result.set_url(url) diff --git a/src/storages/__init__.py b/src/storages/__init__.py index 99f82b3..96baaba 100644 --- a/src/storages/__init__.py +++ b/src/storages/__init__.py @@ -2,4 +2,6 @@ from .base_storage import Storage from .local_storage import LocalStorage, LocalConfig from .s3_storage import S3Config, S3Storage -from .gd_storage import GDConfig, GDStorage \ No newline at end of file +from .gd_storage import GDConfig, GDStorage + +from .storage import StorageV2 \ No newline at end of file diff --git a/src/storages/s3.py b/src/storages/s3.py new file mode 100644 index 0000000..826d66d --- /dev/null +++ b/src/storages/s3.py @@ -0,0 +1,68 @@ + +from typing import IO +import boto3, uuid, os, mimetypes +from botocore.errorfactory import ClientError +from src.storages import StorageV2 +from loguru import logger +from slugify import slugify + + +class S3StorageV2(StorageV2): + name = "s3_storage" + + def __init__(self, config: dict) -> None: + super().__init__(config) + self.s3 = boto3.client( + 's3', + region_name=config.region, + endpoint_url=config.endpoint_url.format(region=config.region), + aws_access_key_id=config.key, + aws_secret_access_key=config.secret + ) + + @staticmethod + def configs() -> dict: + return { + "bucket": {"default": None, "help": "S3 bucket name"}, + "region": {"default": None, "help": "S3 region name"}, + "key": {"default": None, "help": "S3 API key"}, + "secret": {"default": None, "help": "S3 API secret"}, + # TODO: how to have sth like a custom folder? has to come from the feeders + "endpoint_url": { + "default": 'https://{region}.digitaloceanspaces.com', + "help": "S3 bucket endpoint, {region} are inserted at runtime" + }, + "cdn_url": { + "default": 'https://{bucket}.{region}.cdn.digitaloceanspaces.com/{key}', + "help": "S3 CDN url, {bucket}, {region} and {key} are inserted at runtime" + }, + "private": {"default": False, "help": "if true S3 files will not be readable online"}, + "key_path": {"default": "random", "help": "S3 file names are non-predictable strings, one of ['random', 'default']"}, + } + + def get_cdn_url(self, key: str) -> str: + return self.cdn_url.format(bucket=self.bucket, region=self.region, key=self._get_path(key)) + + def uploadf(self, file: IO[bytes], key: str, **kwargs: dict) -> None: + extra_args = kwargs.get("extra_args", {}) + if not self.private and 'ACL' not in extra_args: + extra_args['ACL'] = 'public-read' + + if 'ContentType' not in extra_args: + try: + extra_args['ContentType'] = mimetypes.guess_type(key)[0] + except Exception as e: + logger.error(f"Unable to get mimetype for {key=}, error: {e}") + + self.s3.upload_fileobj(file, Bucket=self.bucket, Key=self._get_path(key), ExtraArgs=extra_args) + + def exists(self, key: str) -> bool: + """ + Tests if a given file with key=key exists in the bucket + """ + try: + self.s3.head_object(Bucket=self.bucket, Key=self._get_path(key)) + return True + except ClientError as e: + logger.warning(f"got a ClientError when checking if {key=} exists in bucket={self.bucket}: {e}") + return False diff --git a/src/storages/storage.py b/src/storages/storage.py new file mode 100644 index 0000000..4052d7e --- /dev/null +++ b/src/storages/storage.py @@ -0,0 +1,21 @@ +from __future__ import annotations +from abc import abstractmethod +from dataclasses import dataclass +from metadata import Metadata +from steps.step import Step + + +@dataclass +class StorageV2(Step): + name = "storage" + + def __init__(self, config: dict) -> None: + # without this STEP.__init__ is not called + super().__init__(config) + + # only for typing... + def init(name: str, config: dict) -> StorageV2: + return Step.init(name, config, StorageV2) + + @abstractmethod + def store(self, item: Metadata) -> Metadata: pass From bb512b36c9c54787c4c5ddd81b6743ae6d03e927 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Wed, 4 Jan 2023 16:37:36 +0000 Subject: [PATCH 029/190] gsheet feeder + db WIP --- orchestration.example.yaml | 4 +- src/archivers/archiver.py | 4 + src/configs/v2config.py | 11 ++- src/databases/__init__.py | 2 + src/databases/database.py | 27 +++++- src/databases/gsheet_db.py | 64 ++++++++++++++ src/enrichers/__init__.py | 2 +- ...r_screenshot.py => screenshot_enricher.py} | 0 src/feeders/__init__.py | 2 +- src/feeders/feeder.py | 6 +- .../{feeder_gsheet.py => gsheet_feeder.py} | 26 +++--- src/metadata.py | 24 +++++- src/orchestrator.py | 83 +++++++++++-------- src/steps/step.py | 2 +- src/storages/__init__.py | 3 +- 15 files changed, 195 insertions(+), 65 deletions(-) create mode 100644 src/databases/__init__.py create mode 100644 src/databases/gsheet_db.py rename src/enrichers/{enricher_screenshot.py => screenshot_enricher.py} (100%) rename src/feeders/{feeder_gsheet.py => gsheet_feeder.py} (85%) diff --git a/orchestration.example.yaml b/orchestration.example.yaml index 7163829..caf7737 100644 --- a/orchestration.example.yaml +++ b/orchestration.example.yaml @@ -1,7 +1,7 @@ steps: # only 1 feeder allowed # a feeder could be in an "infinite loop" for example: gsheets_infinite feeder which holds-> this could be an easy logic addiction by modifying for each to while not feeder.done() if it becomes necessary - feeder: gsheets_feeder # default -> only expects URL from CLI + feeder: gsheet_feeder # default -> only expects URL from CLI archivers: # order matters - telethon # - tiktok @@ -28,7 +28,7 @@ steps: configurations: global: - save_logs: False - gsheets_feeder: + gsheet_feeder: sheet: my-auto-archiver header: 2 # defaults to 1 in GSheetsFeeder service_account: "secrets/service_account.json" diff --git a/src/archivers/archiver.py b/src/archivers/archiver.py index 37f5d4d..f16464a 100644 --- a/src/archivers/archiver.py +++ b/src/archivers/archiver.py @@ -23,6 +23,10 @@ class Archiverv2(Step): # used when archivers need to login or do other one-time setup pass + def clean_url(self, url:str) -> str: + # used to clean unnecessary URL parameters + return url + def _guess_file_type(self, path: str) -> str: """ Receives a URL or filename and returns global mimetype like 'image' or 'video' diff --git a/src/configs/v2config.py b/src/configs/v2config.py index 75a125e..b028b5e 100644 --- a/src/configs/v2config.py +++ b/src/configs/v2config.py @@ -5,6 +5,8 @@ from dataclasses import dataclass, field from typing import List from archivers import Archiverv2 from feeders import Feeder +from databases import Database +from storages import StorageV2 from steps.step import Step from enrichers import Enricher from collections import defaultdict @@ -13,10 +15,13 @@ from collections import defaultdict @dataclass class ConfigV2: # TODO: should Config inherit from Step so it can have it's own configurations? + # these are only detected if they are put to the respective __init__.py configurable_parents = [ Feeder, Enricher, Archiverv2, + Database, + StorageV2 # Util ] feeder: Step # TODO:= BaseFeeder @@ -24,14 +29,14 @@ class ConfigV2: enrichers: List[Enricher] = field(default_factory=[]) formatters: List[Step] = field(default_factory=[]) # TODO: fix type storages: List[Step] = field(default_factory=[]) # TODO: fix type - databases: List[Step] = field(default_factory=[]) # TODO: fix type + databases: List[Database] = field(default_factory=[]) def __init__(self) -> None: self.defaults = {} self.cli_ops = {} self.config = {} - # TODO: make this work for nested props like gsheets_feeder.columns.url = "URL" + # TODO: make this work for nested props like gsheet_feeder.columns.url = "URL" def parse(self): # 1. parse CLI values parser = argparse.ArgumentParser( @@ -84,10 +89,12 @@ class ConfigV2: self.feeder = Feeder.init(steps.get("feeder", "cli_feeder"), self.config) self.enrichers = [Enricher.init(e, self.config) for e in steps.get("enrichers", [])] self.archivers = [Archiverv2.init(e, self.config) for e in steps.get("archivers", [])] + self.databases = [Database.init(e, self.config) for e in steps.get("databases", [])] print("feeder", self.feeder) print("enrichers", [e for e in self.enrichers]) print("archivers", [e for e in self.archivers]) + print("databases", [e for e in self.databases]) def validate(self): pass diff --git a/src/databases/__init__.py b/src/databases/__init__.py new file mode 100644 index 0000000..17b9c6d --- /dev/null +++ b/src/databases/__init__.py @@ -0,0 +1,2 @@ +from .database import Database +from .gsheet_db import GsheetsDb \ No newline at end of file diff --git a/src/databases/database.py b/src/databases/database.py index 15f8d0d..94b2178 100644 --- a/src/databases/database.py +++ b/src/databases/database.py @@ -1,9 +1,11 @@ from __future__ import annotations from dataclasses import dataclass from abc import abstractmethod, ABC +from typing import Union from metadata import Metadata from steps.step import Step + @dataclass class Database(Step, ABC): name = "database" @@ -11,11 +13,30 @@ class Database(Step, ABC): def __init__(self, config: dict) -> None: # without this STEP.__init__ is not called super().__init__(config) - - # only for typing... def init(name: str, config: dict) -> Database: + # only for typing... return Step.init(name, config, Database) @abstractmethod - def enrich(self, item: Metadata) -> Metadata: pass + def started(self, item: Metadata) -> None: + """signals the DB that the given item archival has started""" + pass + + def failed(self, item: Metadata) -> None: + """update DB accordingly for failure""" + pass + + def aborted(self, item: Metadata) -> None: + """abort notification if user cancelled after start""" + pass + + # @abstractmethod + def fetch(self, item: Metadata) -> Union[Metadata, bool]: + """check if the given item has been archived already""" + return False + + @abstractmethod + def done(self, item: Metadata) -> None: + """archival result ready - should be saved to DB""" + pass diff --git a/src/databases/gsheet_db.py b/src/databases/gsheet_db.py new file mode 100644 index 0000000..939e851 --- /dev/null +++ b/src/databases/gsheet_db.py @@ -0,0 +1,64 @@ +from typing import Union, Tuple +import gspread + +# from metadata import Metadata +from loguru import logger + +# from . import Enricher +from databases import Database +from metadata import Metadata +from steps.gsheet import Gsheets +from utils import GWorksheet + + +class GsheetsDb(Database): + """ + NB: only works if GsheetFeeder is used. + could be updated in the future to support non-GsheetFeeder metadata + """ + name = "gsheet_db" + + def __init__(self, config: dict) -> None: + # without this STEP.__init__ is not called + super().__init__(config) + + @staticmethod + def configs() -> dict: + return {} + + def started(self, item: Metadata) -> None: + logger.warning(f"STARTED {item}") + gw, row = self._retrieve_gsheet(item) + gw.set_cell(row, 'status', 'Archive in progress') + + def failed(self, item: Metadata) -> None: + logger.error(f"FAILED {item}") + self._safe_status_update(item, 'Archive failed') + + def aborted(self, item: Metadata) -> None: + logger.warning(f"ABORTED {item}") + self._safe_status_update(item, '') + + def fetch(self, item: Metadata) -> Union[Metadata, bool]: + """check if the given item has been archived already""" + # TODO: this should not be done at the feeder stage then! + return False + + def done(self, item: Metadata) -> None: + """archival result ready - should be saved to DB""" + logger.success(f"DONE {item}") + gw, row = self._retrieve_gsheet(item) + self._safe_status_update(item, 'done') + pass + + def _safe_status_update(self, item: Metadata, new_status: str) -> None: + try: + gw, row = self._retrieve_gsheet(item) + gw.set_cell(row, 'status', new_status) + except Exception as e: + logger.debug(f"Unable to update sheet: {e}") + + def _retrieve_gsheet(self, item: Metadata) -> Tuple[GWorksheet, int]: + gw: GWorksheet = item.get("gsheet").get("worksheet") + row: int = item.get("gsheet").get("row") + return gw, row diff --git a/src/enrichers/__init__.py b/src/enrichers/__init__.py index 3c266f8..503ea2c 100644 --- a/src/enrichers/__init__.py +++ b/src/enrichers/__init__.py @@ -1,2 +1,2 @@ from .enricher import Enricher -from .enricher_screenshot import ScreenshotEnricher \ No newline at end of file +from .screenshot_enricher import ScreenshotEnricher \ No newline at end of file diff --git a/src/enrichers/enricher_screenshot.py b/src/enrichers/screenshot_enricher.py similarity index 100% rename from src/enrichers/enricher_screenshot.py rename to src/enrichers/screenshot_enricher.py diff --git a/src/feeders/__init__.py b/src/feeders/__init__.py index 9fb5942..b11cd50 100644 --- a/src/feeders/__init__.py +++ b/src/feeders/__init__.py @@ -1,2 +1,2 @@ from.feeder import Feeder -from .feeder_gsheet import GsheetsFeeder \ No newline at end of file +from .gsheet_feeder import GsheetsFeeder \ No newline at end of file diff --git a/src/feeders/feeder.py b/src/feeders/feeder.py index d930ba0..bccfab8 100644 --- a/src/feeders/feeder.py +++ b/src/feeders/feeder.py @@ -1,7 +1,7 @@ from __future__ import annotations from dataclasses import dataclass from abc import abstractmethod -# from metadata import Metadata +from metadata import Metadata from steps.step import Step @@ -17,7 +17,5 @@ class Feeder(Step): # only for code typing return Step.init(name, config, Feeder) - # def feed(self, item: Metadata) -> Metadata: pass - @abstractmethod - def __iter__(self) -> Feeder: return None \ No newline at end of file + def __iter__(self) -> Metadata: return None \ No newline at end of file diff --git a/src/feeders/feeder_gsheet.py b/src/feeders/gsheet_feeder.py similarity index 85% rename from src/feeders/feeder_gsheet.py rename to src/feeders/gsheet_feeder.py index ad28af1..b9389a2 100644 --- a/src/feeders/feeder_gsheet.py +++ b/src/feeders/gsheet_feeder.py @@ -1,16 +1,17 @@ -import json, gspread +import gspread # from metadata import Metadata from loguru import logger # from . import Enricher from feeders import Feeder +from metadata import Metadata from steps.gsheet import Gsheets from utils import GWorksheet class GsheetsFeeder(Gsheets, Feeder): - name = "gsheets_feeder" + name = "gsheet_feeder" def __init__(self, config: dict) -> None: # without this STEP.__init__ is not called @@ -35,7 +36,7 @@ class GsheetsFeeder(Gsheets, Feeder): } }) - def __iter__(self) -> str: + def __iter__(self) -> Metadata: sh = self.gsheets_client.open(self.sheet) for ii, wks in enumerate(sh.worksheets()): if not self.should_process_sheet(wks.title): @@ -52,17 +53,16 @@ class GsheetsFeeder(Gsheets, Feeder): for row in range(1 + self.header, gw.count_rows() + 1): url = gw.get_cell(row, 'url').strip() if not len(url): continue - # TODO: gsheet_db should check later if this is supposed to be archived - # static_status = gw.get_cell(row, 'status') - # status = gw.get_cell(row, 'status', fresh=static_status in ['', None] and url != '') - # All checks done - archival process starts here - yield url - logger.success(f'Finished worksheet {wks.title}') - # GWorksheet(self.sheet) - print(self.sheet) - for u in ["url1", "url2"]: - yield u + original_status = gw.get_cell(row, 'status') + status = gw.get_cell(row, 'status', fresh=original_status in ['', None]) + # TODO: custom status parser(?) aka should_retry_from_status + if status not in ['', None]: continue + + # All checks done - archival process starts here + yield Metadata().set_url(url).set("gsheet", {"row": row, "worksheet": gw}, True) + + logger.success(f'Finished worksheet {wks.title}') def should_process_sheet(self, sheet_name: str) -> bool: if len(self.allow_worksheets) and sheet_name not in self.allow_worksheets: diff --git a/src/metadata.py b/src/metadata.py index 8945e1a..90ca743 100644 --- a/src/metadata.py +++ b/src/metadata.py @@ -1,6 +1,6 @@ from __future__ import annotations -from ast import List +from ast import List, Set from typing import Any, Union, Dict from dataclasses import dataclass, field from datetime import datetime @@ -12,8 +12,14 @@ from media import Media @dataclass class Metadata: status: str = "" - metadata: Dict[str, Any] = field(default_factory=dict) + metadata: Dict[str, Any] = field(default_factory=dict) + tmp_keys: Set[str] = field(default_factory=set) # keys that are not to be saved in DBs media: List[Media] = field(default_factory=list) + rearchivable: bool = False + + # def __init__(self, url, metadata = {}) -> None: + # self.set_url(url) + # self.metadata = metadata def merge(self: Metadata, right: Metadata, overwrite_left=True) -> Metadata: """ @@ -21,6 +27,7 @@ class Metadata: """ if overwrite_left: self.status = right.status + self.rearchivable |= right.rearchivable for k, v in right.metadata.items(): assert k not in self.metadata or type(v) == type(self.get(k)) if type(v) not in [dict, list, set] or k not in self.metadata: @@ -33,8 +40,10 @@ class Metadata: return right.merge(self) return self - def set(self, key: str, val: Any) -> Metadata: + def set(self, key: str, val: Any, is_tmp=False) -> Metadata: + # if not self.metadata: self.metadata = {} self.metadata[key] = val + if is_tmp: self.tmp_keys.add(key) return self def get(self, key: str, default: Any = None, create_if_missing=False) -> Union[Metadata, str]: @@ -75,3 +84,12 @@ class Metadata: # # converts all metadata and data into JSON # return json.dumps(self.metadata) # #TODO: datetime is not serializable + + def cleanup(self) -> Metadata: + #TODO: refactor so it returns a JSON with all intended properties, except tmp_keys + # the code below leads to errors if database needs tmp_keys after they are removed + # """removes temporary metadata fields, ideally called after all ops except writing""" + # for tmp_key in self.tmp_keys: + # self.metadata.pop(tmp_key, None) + # self.tmp_keys = set() + pass diff --git a/src/orchestrator.py b/src/orchestrator.py index 2f33370..26baed1 100644 --- a/src/orchestrator.py +++ b/src/orchestrator.py @@ -5,8 +5,11 @@ from dataclasses import dataclass from archivers.archiver import Archiverv2 from enrichers.enricher import Enricher +from databases.database import Database from metadata import Metadata -import tempfile, time +import tempfile, time, traceback +from loguru import logger + """ how not to couple the different pieces of logic @@ -119,7 +122,7 @@ class ArchivingOrchestrator: # identify each formatter, storage, database, etc # self.feeder = Feeder.init(config.feeder, config.get(config.feeder)) - # Is it possible to overwrite config.yaml values? it could be useful: share config file and modify gsheets_feeder.sheet via CLI + # Is it possible to overwrite config.yaml values? it could be useful: share config file and modify gsheet_feeder.sheet via CLI # where does that update/processing happen? in config.py # reflection for Archiver to know wihch child classes it has? use Archiver.__subclasses__ # self.archivers = [ @@ -129,12 +132,12 @@ class ArchivingOrchestrator: self.feeder = config.feeder self.enrichers = config.enrichers self.archivers: List[Archiverv2] = config.archivers + self.databases: List[Database] = config.databases for a in self.archivers: a.setup() self.formatters = [] self.storages = [] - self.databases = [] # self.formatters = [ # Formatter.init(f, config) # for f in config.formatters @@ -154,51 +157,61 @@ class ArchivingOrchestrator: # assert len(archivers) > 1, "there needs to be at least one Archiver" def feed(self) -> list(Metadata): - for url in self.feeder: - print("ARCHIVING", url) - with tempfile.TemporaryDirectory(dir="./") as tmp_dir: - result = self.archive(url, tmp_dir) - print(type(result)) - print(result) - # print(result.as_json()) - print("holding on") - time.sleep(300) + for item in self.feeder: + print("ARCHIVING", item) + try: + with tempfile.TemporaryDirectory(dir="./") as tmp_dir: + item.set("tmp_dir", tmp_dir, True) + result = self.archive(item) + print(result) + except KeyboardInterrupt: + # catches keyboard interruptions to do a clean exit + logger.warning(f"caught interrupt on {item=}") + for d in self.databases: d.aborted(item) + exit() + except Exception as e: + logger.error(f'Got unexpected error on item {item}: {e}\n{traceback.format_exc()}') + for d in self.databases: d.failed(item) + + print("holding on 5min") + time.sleep(300) + # how does this handle the parameters like folder which can be different for each archiver? # the storage needs to know where to archive!! # solution: feeders have context: extra metadata that they can read or ignore, # all of it should have sensible defaults (eg: folder) # default feeder is a list with 1 element - def archive(self, url: str, tmp_dir: str) -> Union[Metadata, None]: - # TODO: - # url = clear_url(url) # should we save if they differ? - # result = Metadata(url=url) - result = Metadata() + def archive(self, result: Metadata) -> Union[Metadata, None]: + url = result.get_url() + # TODO: clean urls + for a in self.archivers: + url = a.clean_url(url) result.set_url(url) - result.set("tmp_dir", tmp_dir) - - should_archive = True - for d in self.databases: should_archive &= d.should_process(url) + # should_archive = False + # for d in self.databases: should_archive |= d.should_process(url) # should storages also be able to check? - for s in self.storages: should_archive &= s.should_process(url) + # for s in self.storages: should_archive |= s.should_process(url) - if not should_archive: - print("skipping") - return "skipping" + # if not should_archive: + # print("skipping") + # return "skipping" # signal to DB that archiving has started + # and propagate already archived if it exists + cached_result = None for d in self.databases: # are the databases to decide whether to archive? # they can simply return True by default, otherwise they can avoid duplicates. should this logic be more granular, for example on the archiver level: a tweet will not need be scraped twice, whereas an instagram profile might. the archiver could not decide from the link which parts to archive, # instagram profile example: it would always re-archive everything # maybe the database/storage could use a hash/key to decide if there's a need to re-archive - if d.should_process(url): - d.started(url) - elif d.exists(url): - return d.fetch(url) - else: - print("Skipping url") - return + d.started(result) + if (local_result := d.fetch(result)): + cached_result = (cached_result or Metadata()).merge(local_result) + if cached_result and not cached_result.rearchivable: + for d in self.databases: + d.done(cached_result) + return cached_result # vk, telethon, ... for a in self.archivers: @@ -209,6 +222,7 @@ class ArchivingOrchestrator: # this is where the Hashes come from, the place with access to all content # the archiver does not have access to storage result.merge(a.download(result)) + # TODO: fix logic if True or result.is_success(): break # what if an archiver returns multiple entries and one is to be part of HTMLgenerator? @@ -224,13 +238,14 @@ class ArchivingOrchestrator: for f in self.formatters: result.merge(f.format(result)) - # storages + # storage for s in self.storages: for m in result.media: - m.merge(s.store(m)) + result.merge(s.store(m)) # signal completion to databases (DBs, Google Sheets, CSV, ...) # a hash registration service could be one database: forensic archiving + result.cleanup() for d in self.databases: d.done(result) return result diff --git a/src/steps/step.py b/src/steps/step.py index 7a2135c..b512af7 100644 --- a/src/steps/step.py +++ b/src/steps/step.py @@ -14,7 +14,7 @@ class Step(ABC): def __init__(self, config: dict) -> None: # reads the configs into object properties # self.config = config[self.name] - for k, v in config[self.name].items(): + for k, v in config.get(self.name, {}).items(): self.__setattr__(k, v) @staticmethod diff --git a/src/storages/__init__.py b/src/storages/__init__.py index 96baaba..91ce148 100644 --- a/src/storages/__init__.py +++ b/src/storages/__init__.py @@ -4,4 +4,5 @@ from .local_storage import LocalStorage, LocalConfig from .s3_storage import S3Config, S3Storage from .gd_storage import GDConfig, GDStorage -from .storage import StorageV2 \ No newline at end of file +from .storage import StorageV2 +from .s3 import S3StorageV2 \ No newline at end of file From 1cdc006b2728d524d6558a8c0215d830632624c7 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Wed, 4 Jan 2023 18:02:44 +0000 Subject: [PATCH 030/190] s3 storaging + WIP gsheets DB --- src/configs/v2config.py | 2 ++ src/databases/gsheet_db.py | 36 +++++++++++++++++++++++++++++++++--- src/feeders/gsheet_feeder.py | 6 +++--- src/media.py | 7 +++---- src/metadata.py | 24 +++++++++++++++++++----- src/orchestrator.py | 17 ++++++++++------- src/steps/gsheet.py | 1 + src/storages/s3.py | 30 ++++++++++++++++-------------- src/storages/storage.py | 24 +++++++++++++++++++++++- 9 files changed, 110 insertions(+), 37 deletions(-) diff --git a/src/configs/v2config.py b/src/configs/v2config.py index b028b5e..7260d41 100644 --- a/src/configs/v2config.py +++ b/src/configs/v2config.py @@ -90,11 +90,13 @@ class ConfigV2: self.enrichers = [Enricher.init(e, self.config) for e in steps.get("enrichers", [])] self.archivers = [Archiverv2.init(e, self.config) for e in steps.get("archivers", [])] self.databases = [Database.init(e, self.config) for e in steps.get("databases", [])] + self.storages = [StorageV2.init(e, self.config) for e in steps.get("storages", [])] print("feeder", self.feeder) print("enrichers", [e for e in self.enrichers]) print("archivers", [e for e in self.archivers]) print("databases", [e for e in self.databases]) + print("storages", [e for e in self.storages]) def validate(self): pass diff --git a/src/databases/gsheet_db.py b/src/databases/gsheet_db.py index 939e851..a5e462f 100644 --- a/src/databases/gsheet_db.py +++ b/src/databases/gsheet_db.py @@ -1,5 +1,5 @@ from typing import Union, Tuple -import gspread +import gspread, datetime # from metadata import Metadata from loguru import logger @@ -7,6 +7,7 @@ from loguru import logger # from . import Enricher from databases import Database from metadata import Metadata +from media import Media from steps.gsheet import Gsheets from utils import GWorksheet @@ -48,8 +49,37 @@ class GsheetsDb(Database): """archival result ready - should be saved to DB""" logger.success(f"DONE {item}") gw, row = self._retrieve_gsheet(item) - self._safe_status_update(item, 'done') - pass + # self._safe_status_update(item, 'done') + + cell_updates = [] + row_values = gw.get_row(row) + + def batch_if_valid(col, val, final_value=None): + final_value = final_value or val + if val and gw.col_exists(col) and gw.get_cell(row_values, col) == '': + cell_updates.append((row, col, final_value)) + + cell_updates.append((row, 'status', item.status)) + + media: Media = item.get_single_media() + + batch_if_valid('archive', media.cdn_url) + batch_if_valid('date', True, datetime.datetime.utcnow().replace(tzinfo=datetime.timezone.utc).isoformat()) + batch_if_valid('title', item.get_title()) + batch_if_valid('text', item.get("content", "")[:500]) + batch_if_valid('timestamp', item.get_timestamp()) + + # TODO: AFTER ENRICHMENTS + # batch_if_valid('hash', media.hash) + # batch_if_valid('thumbnail', result.thumbnail, f'=IMAGE("{result.thumbnail}")') + # batch_if_valid('thumbnail_index', result.thumbnail_index) + # batch_if_valid('duration', result.duration, str(result.duration)) + # batch_if_valid('screenshot', result.screenshot) + # if result.wacz is not None: + # batch_if_valid('wacz', result.wacz) + # batch_if_valid('replaywebpage', f'https://replayweb.page/?source={quote(result.wacz)}#view=pages&url={quote(url)}') + + gw.batch_set_cell(cell_updates) def _safe_status_update(self, item: Metadata, new_status: str) -> None: try: diff --git a/src/feeders/gsheet_feeder.py b/src/feeders/gsheet_feeder.py index b9389a2..029813f 100644 --- a/src/feeders/gsheet_feeder.py +++ b/src/feeders/gsheet_feeder.py @@ -1,4 +1,4 @@ -import gspread +import gspread, os # from metadata import Metadata from loguru import logger @@ -8,7 +8,7 @@ from feeders import Feeder from metadata import Metadata from steps.gsheet import Gsheets from utils import GWorksheet - +from slugify import slugify class GsheetsFeeder(Gsheets, Feeder): name = "gsheet_feeder" @@ -60,7 +60,7 @@ class GsheetsFeeder(Gsheets, Feeder): if status not in ['', None]: continue # All checks done - archival process starts here - yield Metadata().set_url(url).set("gsheet", {"row": row, "worksheet": gw}, True) + yield Metadata().set_url(url).set("gsheet", {"row": row, "worksheet": gw}, True).set("folder", os.path.join(slugify(self.sheet), slugify(wks.title)), True) logger.success(f'Finished worksheet {wks.title}') diff --git a/src/media.py b/src/media.py index ecee4f4..c499b5b 100644 --- a/src/media.py +++ b/src/media.py @@ -10,8 +10,7 @@ import json @dataclass class Media: filename: str - id: str = None - hash: str = None + key: str = None cdn_url: str = None - hash: str = None - + # id: str = None + # hash: str = None # TODO: added by enrichers diff --git a/src/metadata.py b/src/metadata.py index 90ca743..f48c636 100644 --- a/src/metadata.py +++ b/src/metadata.py @@ -3,7 +3,7 @@ from __future__ import annotations from ast import List, Set from typing import Any, Union, Dict from dataclasses import dataclass, field -from datetime import datetime +import datetime # import json from media import Media @@ -70,26 +70,40 @@ class Metadata: def set_title(self, title: str) -> Metadata: return self.set("title", title) - def set_timestamp(self, timestamp: datetime) -> Metadata: - assert type(timestamp) == datetime, "set_timestamp expects a datetime instance" + def get_title(self) -> str: + return self.get("title") + + def set_timestamp(self, timestamp: datetime.datetime) -> Metadata: + assert type(timestamp) == datetime.datetime, "set_timestamp expects a datetime instance" return self.set("timestamp", timestamp) + def get_timestamp(self, utc=True, iso=True) -> datetime.datetime: + ts = self.get("timestamp") + if not ts: return ts + if utc: ts = ts.replace(tzinfo=datetime.timezone.utc) + if iso: return ts.isoformat() + return ts + def add_media(self, media: Media) -> Metadata: # print(f"adding {filename} to {self.metadata.get('media')}") # return self.set("media", self.get_media() + [filename]) # return self.get_media().append(media) return self.media.append(media) + def get_single_media(self) -> Media: + # TODO: check if formatters were applied and choose with priority + return self.media[0] + # def as_json(self) -> str: # # converts all metadata and data into JSON # return json.dumps(self.metadata) # #TODO: datetime is not serializable def cleanup(self) -> Metadata: - #TODO: refactor so it returns a JSON with all intended properties, except tmp_keys + # TODO: refactor so it returns a JSON with all intended properties, except tmp_keys # the code below leads to errors if database needs tmp_keys after they are removed # """removes temporary metadata fields, ideally called after all ops except writing""" # for tmp_key in self.tmp_keys: - # self.metadata.pop(tmp_key, None) + # self.metadata.pop(tmp_key, None) # self.tmp_keys = set() pass diff --git a/src/orchestrator.py b/src/orchestrator.py index 26baed1..3bc5ea7 100644 --- a/src/orchestrator.py +++ b/src/orchestrator.py @@ -2,15 +2,18 @@ from __future__ import annotations from ast import List from typing import Union, Dict from dataclasses import dataclass -from archivers.archiver import Archiverv2 -from enrichers.enricher import Enricher -from databases.database import Database +from archivers import Archiverv2 +from storages import StorageV2 +from enrichers import Enricher +from databases import Database from metadata import Metadata + import tempfile, time, traceback from loguru import logger + """ how not to couple the different pieces of logic due to the use of constants for the metadata keys? @@ -133,11 +136,11 @@ class ArchivingOrchestrator: self.enrichers = config.enrichers self.archivers: List[Archiverv2] = config.archivers self.databases: List[Database] = config.databases + self.storages: List[StorageV2] = config.storages for a in self.archivers: a.setup() self.formatters = [] - self.storages = [] # self.formatters = [ # Formatter.init(f, config) # for f in config.formatters @@ -184,7 +187,7 @@ class ArchivingOrchestrator: def archive(self, result: Metadata) -> Union[Metadata, None]: url = result.get_url() - # TODO: clean urls + # TODO: clean urls for a in self.archivers: url = a.clean_url(url) result.set_url(url) @@ -240,8 +243,8 @@ class ArchivingOrchestrator: # storage for s in self.storages: - for m in result.media: - result.merge(s.store(m)) + for i, m in enumerate(result.media): + result.media[i] = s.store(m, result) # signal completion to databases (DBs, Google Sheets, CSV, ...) # a hash registration service could be one database: forensic archiving diff --git a/src/steps/gsheet.py b/src/steps/gsheet.py index 279c036..6bfb5d7 100644 --- a/src/steps/gsheet.py +++ b/src/steps/gsheet.py @@ -30,6 +30,7 @@ class Gsheets(Step): 'thumbnail_index': 'thumbnail index', 'timestamp': 'upload timestamp', 'title': 'upload title', + 'text': 'text content', 'duration': 'duration', 'screenshot': 'screenshot', 'hash': 'hash', diff --git a/src/storages/s3.py b/src/storages/s3.py index 826d66d..d4457e8 100644 --- a/src/storages/s3.py +++ b/src/storages/s3.py @@ -1,8 +1,10 @@ -from typing import IO +from typing import IO, Any import boto3, uuid, os, mimetypes from botocore.errorfactory import ClientError -from src.storages import StorageV2 +from metadata import Metadata +from media import Media +from storages import StorageV2 from loguru import logger from slugify import slugify @@ -14,10 +16,10 @@ class S3StorageV2(StorageV2): super().__init__(config) self.s3 = boto3.client( 's3', - region_name=config.region, - endpoint_url=config.endpoint_url.format(region=config.region), - aws_access_key_id=config.key, - aws_secret_access_key=config.secret + region_name=self.region, + endpoint_url=self.endpoint_url.format(region=self.region), + aws_access_key_id=self.key, + aws_secret_access_key=self.secret ) @staticmethod @@ -37,31 +39,31 @@ class S3StorageV2(StorageV2): "help": "S3 CDN url, {bucket}, {region} and {key} are inserted at runtime" }, "private": {"default": False, "help": "if true S3 files will not be readable online"}, - "key_path": {"default": "random", "help": "S3 file names are non-predictable strings, one of ['random', 'default']"}, + # "key_path": {"default": "random", "help": "S3 file names are non-predictable strings, one of ['random', 'default']"}, } - def get_cdn_url(self, key: str) -> str: - return self.cdn_url.format(bucket=self.bucket, region=self.region, key=self._get_path(key)) + def get_cdn_url(self, media: Media) -> str: + return self.cdn_url.format(bucket=self.bucket, region=self.region, key=media.key) - def uploadf(self, file: IO[bytes], key: str, **kwargs: dict) -> None: + def uploadf(self, file: IO[bytes], media: Media, **kwargs: dict) -> Any: extra_args = kwargs.get("extra_args", {}) if not self.private and 'ACL' not in extra_args: extra_args['ACL'] = 'public-read' if 'ContentType' not in extra_args: try: - extra_args['ContentType'] = mimetypes.guess_type(key)[0] + extra_args['ContentType'] = mimetypes.guess_type(media.key)[0] except Exception as e: - logger.error(f"Unable to get mimetype for {key=}, error: {e}") + logger.warning(f"Unable to get mimetype for {media.key=}, error: {e}") - self.s3.upload_fileobj(file, Bucket=self.bucket, Key=self._get_path(key), ExtraArgs=extra_args) + self.s3.upload_fileobj(file, Bucket=self.bucket, Key=media.key, ExtraArgs=extra_args) def exists(self, key: str) -> bool: """ Tests if a given file with key=key exists in the bucket """ try: - self.s3.head_object(Bucket=self.bucket, Key=self._get_path(key)) + self.s3.head_object(Bucket=self.bucket, Key=key) return True except ClientError as e: logger.warning(f"got a ClientError when checking if {key=} exists in bucket={self.bucket}: {e}") diff --git a/src/storages/storage.py b/src/storages/storage.py index 4052d7e..06346e9 100644 --- a/src/storages/storage.py +++ b/src/storages/storage.py @@ -1,8 +1,12 @@ from __future__ import annotations from abc import abstractmethod from dataclasses import dataclass +from typing import IO, Any +from media import Media from metadata import Metadata from steps.step import Step +from loguru import logger +import os, uuid @dataclass @@ -17,5 +21,23 @@ class StorageV2(Step): def init(name: str, config: dict) -> StorageV2: return Step.init(name, config, StorageV2) + def store(self, media: Media, item: Metadata) -> Media: + media = self.set_key(media, item) + self.upload(media) + media.cdn_url = self.get_cdn_url(media) + return media + @abstractmethod - def store(self, item: Metadata) -> Metadata: pass + def uploadf(self, file: IO[bytes], key: str, **kwargs: dict) -> Any: pass + + def upload(self, media: Media, **kwargs) -> Any: + logger.debug(f'[{self.__class__.name}] uploading file {media.filename} with key {media.key}') + with open(media.filename, 'rb') as f: + return self.uploadf(f, media, **kwargs) + + def set_key(self, media: Media, item: Metadata) -> Media: + """takes the media and optionally item info and generates a key""" + folder = item.get("folder", "") + ext = os.path.splitext(media.filename)[1] + media.key = os.path.join(folder, f"{str(uuid.uuid4())}{ext}") + return media From aac16fa8c2946e75e63cc861649bcdb675fbc860 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Mon, 9 Jan 2023 22:24:44 +0000 Subject: [PATCH 031/190] minor comments --- src/databases/gsheet_db.py | 1 + src/steps/step.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/src/databases/gsheet_db.py b/src/databases/gsheet_db.py index a5e462f..ba3785a 100644 --- a/src/databases/gsheet_db.py +++ b/src/databases/gsheet_db.py @@ -91,4 +91,5 @@ class GsheetsDb(Database): def _retrieve_gsheet(self, item: Metadata) -> Tuple[GWorksheet, int]: gw: GWorksheet = item.get("gsheet").get("worksheet") row: int = item.get("gsheet").get("row") + #TODO: to make gsheet_db less coupled with gsheet_feeder's "gsheet" parameter, this method could 1st try to fetch "gsheet" from item and, if missing, manage its own singleton - not needed for now return gw, row diff --git a/src/steps/step.py b/src/steps/step.py index b512af7..a8bad38 100644 --- a/src/steps/step.py +++ b/src/steps/step.py @@ -28,4 +28,4 @@ class Step(ABC): for sub in child.__subclasses__(): if sub.name == name: return sub(config) - raise ClassFoundException(f"Unable to initialize STEP with {name=}") + raise ClassFoundException(f"Unable to initialize STEP with {name=}, check your configuration file/step names.") From d4825196f13e8037cbd9005fa4d630d17f42c0ef Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Tue, 10 Jan 2023 00:22:16 +0000 Subject: [PATCH 032/190] html template working with jinja templates --- Pipfile | 2 + Pipfile.lock | 299 ++++++++++---------- src/configs/v2config.py | 9 +- src/formatters/__init__.py | 2 + src/formatters/formatter.py | 21 ++ src/formatters/html_formatter.py | 36 +++ src/formatters/templates/html_template.html | 101 +++++++ src/media.py | 9 +- src/metadata.py | 26 +- src/orchestrator.py | 27 +- src/steps/gsheet.py | 1 + 11 files changed, 369 insertions(+), 164 deletions(-) create mode 100644 src/formatters/__init__.py create mode 100644 src/formatters/formatter.py create mode 100644 src/formatters/html_formatter.py create mode 100644 src/formatters/templates/html_template.html diff --git a/Pipfile b/Pipfile index 2095f2b..d79388d 100644 --- a/Pipfile +++ b/Pipfile @@ -27,6 +27,8 @@ vk-url-scraper = "*" python-twitter-v2 = "*" instaloader = "*" tqdm = "*" +jinja2 = "*" +cryptography = "==38.0.4" [requires] python_version = "3.9" diff --git a/Pipfile.lock b/Pipfile.lock index 5bfeba7..83e2607 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -1,7 +1,7 @@ { "_meta": { "hash": { - "sha256": "60b8f39d7a466e194c98a3fb6a03f74f03b108f5fac4cce8657c5ffdf6a02962" + "sha256": "bcc36e9ecdf6d383a1010629484eec271699ac23b40be045d9a9669b4c9fac8c" }, "pipfile-spec": 6, "requires": { @@ -34,11 +34,11 @@ }, "attrs": { "hashes": [ - "sha256:29adc2665447e5191d0e7c568fde78b21f9672d344281d0c6e1ab085429b22b6", - "sha256:86efa402f67bf2df34f51a335487cf46b1ec130d02b8d39fd248abfd30da551c" + "sha256:29e95c7f6778868dbd49170f98f8818f78f3dc5e0e37c0b1f474e3561b240836", + "sha256:c9227bfc2f01993c03f68db37d1d15c9690188323c067c641f1a35ca58185f99" ], - "markers": "python_version >= '3.5'", - "version": "==22.1.0" + "markers": "python_version >= '3.6'", + "version": "==22.2.0" }, "authlib": { "hashes": [ @@ -57,19 +57,19 @@ }, "boto3": { "hashes": [ - "sha256:53badfc5f145b8a3f9117512b41bc5a64db1cce1b549061d8edba68909e63fdf", - "sha256:548081a0f8854bb2eea1e368ab29945478105f56989546f653c75528dcb07d88" + "sha256:96055651f7be882175aa334ad46528e1ad79fb8ca33fa9c3998cc1d985b34eab", + "sha256:e24d65c31780c208768ebcd152d8a0181591c9c8e7d971e23f318d7f41910ba1" ], "index": "pypi", - "version": "==1.26.28" + "version": "==1.26.46" }, "botocore": { "hashes": [ - "sha256:982732e7ed65cb6ed11ea3ce0e32dff2bcd465836c32376154f0802aa0a112c7", - "sha256:f0b8bb976e368dea20a960b47169e31fc0828feb6f0b9f59f1e5be8d08919b10" + "sha256:78bf25933e35eb6354a9e80fe156f86dce4d346a92afe364dfce25c17ab0639f", + "sha256:dbac2fde265f13beb9191ec3ff63b90b515e9ed63875edc3afbd72c5f585e48b" ], "markers": "python_version >= '3.7'", - "version": "==1.29.28" + "version": "==1.29.46" }, "brotli": { "hashes": [ @@ -168,11 +168,11 @@ }, "cachetools": { "hashes": [ - "sha256:6a94c6402995a99c3970cc7e4884bb60b4a8639938157eeed436098bf9831757", - "sha256:f9f17d2aec496a9aa6b76f53e3b614c965223c061982d434d160f930c698a9db" + "sha256:5991bc0e08a1319bb618d3195ca5b6bc76646a49c21d55962977197b301cc1fe", + "sha256:8462eebf3a6c15d25430a8c27c56ac61340b2ecf60c9ce57afc2b97e450e47da" ], "markers": "python_version ~= '3.7'", - "version": "==5.2.0" + "version": "==5.2.1" }, "certifi": { "hashes": [ @@ -269,10 +269,10 @@ }, "cloudscraper": { "hashes": [ - "sha256:5f0cde23774270e8a092de68e0fbd68e17854c767fc2d4042a91bda9e4816871", - "sha256:ec30da6cee60d0a95e898d9b3aaf09291a0d8b6cf751e86c6f3420b699a00091" + "sha256:2776c70f3661c028e59fd306ac2b104882c9b3cb3f798086251e00fc2d72c3a2", + "sha256:3b9753724616ac4d811e7922ddc9dba9b4419749ebaa35b0ba503d442522df2e" ], - "version": "==1.2.66" + "version": "==1.2.67" }, "commonmark": { "hashes": [ @@ -310,7 +310,7 @@ "sha256:ca57eb3ddaccd1112c18fc80abe41db443cc2e9dcb1917078e02dfa010a4f353", "sha256:ce127dd0a6a0811c251a6cddd014d292728484e530d80e872ad9806cfb1c5b3c" ], - "markers": "python_version >= '3.6'", + "index": "pypi", "version": "==38.0.4" }, "dataclasses-json": { @@ -323,19 +323,19 @@ }, "dateparser": { "hashes": [ - "sha256:4431159799b63d8acec5d7d844c5e06edf3d1b0eb2bda6d4cac87134ddddd01c", - "sha256:73ec6e44a133c54076ecf9f9dc0fbe3dd4831f154f977ff06f53114d57c5425e" + "sha256:107f3cc87a60770e10d111349adc1504224a6b60753a47a64b0ec842ab85b5a9", + "sha256:ceb159f1b4a9df54ed6209e91298097deafde476037f8611b4cb2b1cb8b31c58" ], "index": "pypi", - "version": "==1.1.4" + "version": "==1.1.5" }, "exceptiongroup": { "hashes": [ - "sha256:542adf9dea4055530d6e1279602fa5cb11dab2395fa650b8674eaec35fc4a828", - "sha256:bd14967b79cd9bdb54d97323216f8fdf533e278df937aa2a90089e7d6e06e5ec" + "sha256:327cbda3da756e2de031a3107b81ab7b3770a602c4d16ca618298c526f4bec1e", + "sha256:bcb67d800a4497e1b404c2dd44fca47d3b7a5e5433dbab67f96c1a685cdfdf23" ], "markers": "python_version < '3.11'", - "version": "==1.0.4" + "version": "==1.1.0" }, "ffmpeg-python": { "hashes": [ @@ -347,11 +347,11 @@ }, "filelock": { "hashes": [ - "sha256:7565f628ea56bfcd8e54e42bdc55da899c85c1abfe1b5bcfd147e9188cebb3b2", - "sha256:8df285554452285f79c035efb0c861eb33a4bcfa5b7a137016e32e6a90f9792c" + "sha256:7b319f24340b51f55a2bf7a12ac0755a9b03e718311dac567a0f4f7fabd2f5de", + "sha256:f58d535af89bb9ad5cd4df046f741f8553a418c01a7856bf0d173bbc9f6bd16d" ], "markers": "python_version >= '3.7'", - "version": "==3.8.2" + "version": "==3.9.0" }, "flask": { "hashes": [ @@ -378,19 +378,19 @@ }, "google-api-python-client": { "hashes": [ - "sha256:03624a28b5ba94f3c3d44761081f5dbf8cabaa20c5c3a96c046457c5713efb9b", - "sha256:bc2447a7479006d98927fb20faa74d892d3758ff68e99b621367632bc42b8af8" + "sha256:9412ad3445518fa9d24d02c673a70b07c9d124990f44763cdf4f5304ca5b4d08", + "sha256:a4ea351db2bb2a9b1a7e96d8fa8de0fcbc31d9e237b724f4a07b243c2d63e9a4" ], "index": "pypi", - "version": "==2.69.0" + "version": "==2.71.0" }, "google-auth": { "hashes": [ - "sha256:6897b93556d8d807ad70701bb89f000183aea366ca7ed94680828b37437a4994", - "sha256:72f12a6cfc968d754d7bdab369c5c5c16032106e52d32c6dfd8484e4c01a6d1f" + "sha256:5045648c821fb72384cdc0e82cc326df195f113a33049d9b62b74589243d2acc", + "sha256:ed7057a101af1146f0554a769930ac9de506aeca4fd5af6543ebe791851a9fbd" ], "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5'", - "version": "==2.15.0" + "version": "==2.16.0" }, "google-auth-httplib2": { "hashes": [ @@ -410,11 +410,11 @@ }, "googleapis-common-protos": { "hashes": [ - "sha256:27a849d6205838fb6cc3c1c21cb9800707a661bb21c6ce7fb13e99eb1f8a0c46", - "sha256:a9f4a1d7f6d9809657b7f1316a1aa527f6664891531bcfcc13b6696e685f443c" + "sha256:c727251ec025947d545184ba17e3578840fc3a24a0516a020479edab660457df", + "sha256:ca3befcd4580dab6ad49356b46bf165bb68ff4b32389f028f1abd7c10ab9519a" ], "markers": "python_version >= '3.7'", - "version": "==1.57.0" + "version": "==1.58.0" }, "gspread": { "hashes": [ @@ -468,7 +468,7 @@ "sha256:31351a702a408a9e7595a8fc6150fc3f43bb6bf7e319770cbc0db9df9437e852", "sha256:6088930bfe239f0e6710546ab9c19c9ef35e29792895fed6e6e31a023a182a61" ], - "markers": "python_version >= '3.7'", + "index": "pypi", "version": "==3.1.2" }, "jmespath": { @@ -489,79 +489,86 @@ }, "lxml": { "hashes": [ - "sha256:04da965dfebb5dac2619cb90fcf93efdb35b3c6994fea58a157a834f2f94b318", - "sha256:0538747a9d7827ce3e16a8fdd201a99e661c7dee3c96c885d8ecba3c35d1032c", - "sha256:0645e934e940107e2fdbe7c5b6fb8ec6232444260752598bc4d09511bd056c0b", - "sha256:079b68f197c796e42aa80b1f739f058dcee796dc725cc9a1be0cdb08fc45b000", - "sha256:0f3f0059891d3254c7b5fb935330d6db38d6519ecd238ca4fce93c234b4a0f73", - "sha256:10d2017f9150248563bb579cd0d07c61c58da85c922b780060dcc9a3aa9f432d", - "sha256:1355755b62c28950f9ce123c7a41460ed9743c699905cbe664a5bcc5c9c7c7fb", - "sha256:13c90064b224e10c14dcdf8086688d3f0e612db53766e7478d7754703295c7c8", - "sha256:1423631e3d51008871299525b541413c9b6c6423593e89f9c4cfbe8460afc0a2", - "sha256:1436cf0063bba7888e43f1ba8d58824f085410ea2025befe81150aceb123e345", - "sha256:1a7c59c6ffd6ef5db362b798f350e24ab2cfa5700d53ac6681918f314a4d3b94", - "sha256:1e1cf47774373777936c5aabad489fef7b1c087dcd1f426b621fda9dcc12994e", - "sha256:206a51077773c6c5d2ce1991327cda719063a47adc02bd703c56a662cdb6c58b", - "sha256:21fb3d24ab430fc538a96e9fbb9b150029914805d551deeac7d7822f64631dfc", - "sha256:27e590352c76156f50f538dbcebd1925317a0f70540f7dc8c97d2931c595783a", - "sha256:287605bede6bd36e930577c5925fcea17cb30453d96a7b4c63c14a257118dbb9", - "sha256:2aaf6a0a6465d39b5ca69688fce82d20088c1838534982996ec46633dc7ad6cc", - "sha256:32a73c53783becdb7eaf75a2a1525ea8e49379fb7248c3eeefb9412123536387", - "sha256:41fb58868b816c202e8881fd0f179a4644ce6e7cbbb248ef0283a34b73ec73bb", - "sha256:4780677767dd52b99f0af1f123bc2c22873d30b474aa0e2fc3fe5e02217687c7", - "sha256:4878e667ebabe9b65e785ac8da4d48886fe81193a84bbe49f12acff8f7a383a4", - "sha256:487c8e61d7acc50b8be82bda8c8d21d20e133c3cbf41bd8ad7eb1aaeb3f07c97", - "sha256:4beea0f31491bc086991b97517b9683e5cfb369205dac0148ef685ac12a20a67", - "sha256:4cfbe42c686f33944e12f45a27d25a492cc0e43e1dc1da5d6a87cbcaf2e95627", - "sha256:4d5bae0a37af799207140652a700f21a85946f107a199bcb06720b13a4f1f0b7", - "sha256:4e285b5f2bf321fc0857b491b5028c5f276ec0c873b985d58d7748ece1d770dd", - "sha256:57e4d637258703d14171b54203fd6822fda218c6c2658a7d30816b10995f29f3", - "sha256:5974895115737a74a00b321e339b9c3f45c20275d226398ae79ac008d908bff7", - "sha256:5ef87fca280fb15342726bd5f980f6faf8b84a5287fcc2d4962ea8af88b35130", - "sha256:603a464c2e67d8a546ddaa206d98e3246e5db05594b97db844c2f0a1af37cf5b", - "sha256:6653071f4f9bac46fbc30f3c7838b0e9063ee335908c5d61fb7a4a86c8fd2036", - "sha256:6ca2264f341dd81e41f3fffecec6e446aa2121e0b8d026fb5130e02de1402785", - "sha256:6d279033bf614953c3fc4a0aa9ac33a21e8044ca72d4fa8b9273fe75359d5cca", - "sha256:6d949f53ad4fc7cf02c44d6678e7ff05ec5f5552b235b9e136bd52e9bf730b91", - "sha256:6daa662aba22ef3258934105be2dd9afa5bb45748f4f702a3b39a5bf53a1f4dc", - "sha256:6eafc048ea3f1b3c136c71a86db393be36b5b3d9c87b1c25204e7d397cee9536", - "sha256:830c88747dce8a3e7525defa68afd742b4580df6aa2fdd6f0855481e3994d391", - "sha256:86e92728ef3fc842c50a5cb1d5ba2bc66db7da08a7af53fb3da79e202d1b2cd3", - "sha256:8caf4d16b31961e964c62194ea3e26a0e9561cdf72eecb1781458b67ec83423d", - "sha256:8d1a92d8e90b286d491e5626af53afef2ba04da33e82e30744795c71880eaa21", - "sha256:8f0a4d179c9a941eb80c3a63cdb495e539e064f8054230844dcf2fcb812b71d3", - "sha256:9232b09f5efee6a495a99ae6824881940d6447debe272ea400c02e3b68aad85d", - "sha256:927a9dd016d6033bc12e0bf5dee1dde140235fc8d0d51099353c76081c03dc29", - "sha256:93e414e3206779ef41e5ff2448067213febf260ba747fc65389a3ddaa3fb8715", - "sha256:98cafc618614d72b02185ac583c6f7796202062c41d2eeecdf07820bad3295ed", - "sha256:9c3a88d20e4fe4a2a4a84bf439a5ac9c9aba400b85244c63a1ab7088f85d9d25", - "sha256:9f36de4cd0c262dd9927886cc2305aa3f2210db437aa4fed3fb4940b8bf4592c", - "sha256:a60f90bba4c37962cbf210f0188ecca87daafdf60271f4c6948606e4dabf8785", - "sha256:a614e4afed58c14254e67862456d212c4dcceebab2eaa44d627c2ca04bf86837", - "sha256:ae06c1e4bc60ee076292e582a7512f304abdf6c70db59b56745cca1684f875a4", - "sha256:b122a188cd292c4d2fcd78d04f863b789ef43aa129b233d7c9004de08693728b", - "sha256:b570da8cd0012f4af9fa76a5635cd31f707473e65a5a335b186069d5c7121ff2", - "sha256:bcaa1c495ce623966d9fc8a187da80082334236a2a1c7e141763ffaf7a405067", - "sha256:bd34f6d1810d9354dc7e35158aa6cc33456be7706df4420819af6ed966e85448", - "sha256:be9eb06489bc975c38706902cbc6888f39e946b81383abc2838d186f0e8b6a9d", - "sha256:c4b2e0559b68455c085fb0f6178e9752c4be3bba104d6e881eb5573b399d1eb2", - "sha256:c62e8dd9754b7debda0c5ba59d34509c4688f853588d75b53c3791983faa96fc", - "sha256:c852b1530083a620cb0de5f3cd6826f19862bafeaf77586f1aef326e49d95f0c", - "sha256:d9fc0bf3ff86c17348dfc5d322f627d78273eba545db865c3cd14b3f19e57fa5", - "sha256:dad7b164905d3e534883281c050180afcf1e230c3d4a54e8038aa5cfcf312b84", - "sha256:e5f66bdf0976ec667fc4594d2812a00b07ed14d1b44259d19a41ae3fff99f2b8", - "sha256:e8f0c9d65da595cfe91713bc1222af9ecabd37971762cb830dea2fc3b3bb2acf", - "sha256:edffbe3c510d8f4bf8640e02ca019e48a9b72357318383ca60e3330c23aaffc7", - "sha256:eea5d6443b093e1545ad0210e6cf27f920482bfcf5c77cdc8596aec73523bb7e", - "sha256:ef72013e20dd5ba86a8ae1aed7f56f31d3374189aa8b433e7b12ad182c0d2dfb", - "sha256:f05251bbc2145349b8d0b77c0d4e5f3b228418807b1ee27cefb11f69ed3d233b", - "sha256:f1be258c4d3dc609e654a1dc59d37b17d7fef05df912c01fc2e15eb43a9735f3", - "sha256:f9ced82717c7ec65a67667bb05865ffe38af0e835cdd78728f1209c8fffe0cad", - "sha256:fe17d10b97fdf58155f858606bddb4e037b805a60ae023c009f760d8361a4eb8", - "sha256:fe749b052bb7233fe5d072fcb549221a8cb1a16725c47c37e42b0b9cb3ff2c3f" + "sha256:01d36c05f4afb8f7c20fd9ed5badca32a2029b93b1750f571ccc0b142531caf7", + "sha256:04876580c050a8c5341d706dd464ff04fd597095cc8c023252566a8826505726", + "sha256:05ca3f6abf5cf78fe053da9b1166e062ade3fa5d4f92b4ed688127ea7d7b1d03", + "sha256:090c6543d3696cbe15b4ac6e175e576bcc3f1ccfbba970061b7300b0c15a2140", + "sha256:0dc313ef231edf866912e9d8f5a042ddab56c752619e92dfd3a2c277e6a7299a", + "sha256:0f2b1e0d79180f344ff9f321327b005ca043a50ece8713de61d1cb383fb8ac05", + "sha256:13598ecfbd2e86ea7ae45ec28a2a54fb87ee9b9fdb0f6d343297d8e548392c03", + "sha256:16efd54337136e8cd72fb9485c368d91d77a47ee2d42b057564aae201257d419", + "sha256:1ab8f1f932e8f82355e75dda5413a57612c6ea448069d4fb2e217e9a4bed13d4", + "sha256:223f4232855ade399bd409331e6ca70fb5578efef22cf4069a6090acc0f53c0e", + "sha256:2455cfaeb7ac70338b3257f41e21f0724f4b5b0c0e7702da67ee6c3640835b67", + "sha256:2899456259589aa38bfb018c364d6ae7b53c5c22d8e27d0ec7609c2a1ff78b50", + "sha256:2a29ba94d065945944016b6b74e538bdb1751a1db6ffb80c9d3c2e40d6fa9894", + "sha256:2a87fa548561d2f4643c99cd13131acb607ddabb70682dcf1dff5f71f781a4bf", + "sha256:2e430cd2824f05f2d4f687701144556646bae8f249fd60aa1e4c768ba7018947", + "sha256:36c3c175d34652a35475a73762b545f4527aec044910a651d2bf50de9c3352b1", + "sha256:3818b8e2c4b5148567e1b09ce739006acfaa44ce3156f8cbbc11062994b8e8dd", + "sha256:3ab9fa9d6dc2a7f29d7affdf3edebf6ece6fb28a6d80b14c3b2fb9d39b9322c3", + "sha256:3efea981d956a6f7173b4659849f55081867cf897e719f57383698af6f618a92", + "sha256:4c8f293f14abc8fd3e8e01c5bd86e6ed0b6ef71936ded5bf10fe7a5efefbaca3", + "sha256:5344a43228767f53a9df6e5b253f8cdca7dfc7b7aeae52551958192f56d98457", + "sha256:58bfa3aa19ca4c0f28c5dde0ff56c520fbac6f0daf4fac66ed4c8d2fb7f22e74", + "sha256:5b4545b8a40478183ac06c073e81a5ce4cf01bf1734962577cf2bb569a5b3bbf", + "sha256:5f50a1c177e2fa3ee0667a5ab79fdc6b23086bc8b589d90b93b4bd17eb0e64d1", + "sha256:63da2ccc0857c311d764e7d3d90f429c252e83b52d1f8f1d1fe55be26827d1f4", + "sha256:6749649eecd6a9871cae297bffa4ee76f90b4504a2a2ab528d9ebe912b101975", + "sha256:6804daeb7ef69e7b36f76caddb85cccd63d0c56dedb47555d2fc969e2af6a1a5", + "sha256:689bb688a1db722485e4610a503e3e9210dcc20c520b45ac8f7533c837be76fe", + "sha256:699a9af7dffaf67deeae27b2112aa06b41c370d5e7633e0ee0aea2e0b6c211f7", + "sha256:6b418afe5df18233fc6b6093deb82a32895b6bb0b1155c2cdb05203f583053f1", + "sha256:76cf573e5a365e790396a5cc2b909812633409306c6531a6877c59061e42c4f2", + "sha256:7b515674acfdcadb0eb5d00d8a709868173acece5cb0be3dd165950cbfdf5409", + "sha256:7b770ed79542ed52c519119473898198761d78beb24b107acf3ad65deae61f1f", + "sha256:7d2278d59425777cfcb19735018d897ca8303abe67cc735f9f97177ceff8027f", + "sha256:7e91ee82f4199af8c43d8158024cbdff3d931df350252288f0d4ce656df7f3b5", + "sha256:821b7f59b99551c69c85a6039c65b75f5683bdc63270fec660f75da67469ca24", + "sha256:822068f85e12a6e292803e112ab876bc03ed1f03dddb80154c395f891ca6b31e", + "sha256:8340225bd5e7a701c0fa98284c849c9b9fc9238abf53a0ebd90900f25d39a4e4", + "sha256:85cabf64adec449132e55616e7ca3e1000ab449d1d0f9d7f83146ed5bdcb6d8a", + "sha256:880bbbcbe2fca64e2f4d8e04db47bcdf504936fa2b33933efd945e1b429bea8c", + "sha256:8d0b4612b66ff5d62d03bcaa043bb018f74dfea51184e53f067e6fdcba4bd8de", + "sha256:8e20cb5a47247e383cf4ff523205060991021233ebd6f924bca927fcf25cf86f", + "sha256:925073b2fe14ab9b87e73f9a5fde6ce6392da430f3004d8b72cc86f746f5163b", + "sha256:998c7c41910666d2976928c38ea96a70d1aa43be6fe502f21a651e17483a43c5", + "sha256:9b22c5c66f67ae00c0199f6055705bc3eb3fcb08d03d2ec4059a2b1b25ed48d7", + "sha256:9f102706d0ca011de571de32c3247c6476b55bb6bc65a20f682f000b07a4852a", + "sha256:a08cff61517ee26cb56f1e949cca38caabe9ea9fbb4b1e10a805dc39844b7d5c", + "sha256:a0a336d6d3e8b234a3aae3c674873d8f0e720b76bc1d9416866c41cd9500ffb9", + "sha256:a35f8b7fa99f90dd2f5dc5a9fa12332642f087a7641289ca6c40d6e1a2637d8e", + "sha256:a38486985ca49cfa574a507e7a2215c0c780fd1778bb6290c21193b7211702ab", + "sha256:a5da296eb617d18e497bcf0a5c528f5d3b18dadb3619fbdadf4ed2356ef8d941", + "sha256:a6e441a86553c310258aca15d1c05903aaf4965b23f3bc2d55f200804e005ee5", + "sha256:a82d05da00a58b8e4c0008edbc8a4b6ec5a4bc1e2ee0fb6ed157cf634ed7fa45", + "sha256:ab323679b8b3030000f2be63e22cdeea5b47ee0abd2d6a1dc0c8103ddaa56cd7", + "sha256:b1f42b6921d0e81b1bcb5e395bc091a70f41c4d4e55ba99c6da2b31626c44892", + "sha256:b23e19989c355ca854276178a0463951a653309fb8e57ce674497f2d9f208746", + "sha256:b264171e3143d842ded311b7dccd46ff9ef34247129ff5bf5066123c55c2431c", + "sha256:b26a29f0b7fc6f0897f043ca366142d2b609dc60756ee6e4e90b5f762c6adc53", + "sha256:b64d891da92e232c36976c80ed7ebb383e3f148489796d8d31a5b6a677825efe", + "sha256:b9cc34af337a97d470040f99ba4282f6e6bac88407d021688a5d585e44a23184", + "sha256:bc718cd47b765e790eecb74d044cc8d37d58562f6c314ee9484df26276d36a38", + "sha256:be7292c55101e22f2a3d4d8913944cbea71eea90792bf914add27454a13905df", + "sha256:c83203addf554215463b59f6399835201999b5e48019dc17f182ed5ad87205c9", + "sha256:c9ec3eaf616d67db0764b3bb983962b4f385a1f08304fd30c7283954e6a7869b", + "sha256:ca34efc80a29351897e18888c71c6aca4a359247c87e0b1c7ada14f0ab0c0fb2", + "sha256:ca989b91cf3a3ba28930a9fc1e9aeafc2a395448641df1f387a2d394638943b0", + "sha256:d02a5399126a53492415d4906ab0ad0375a5456cc05c3fc0fc4ca11771745cda", + "sha256:d17bc7c2ccf49c478c5bdd447594e82692c74222698cfc9b5daae7ae7e90743b", + "sha256:d5bf6545cd27aaa8a13033ce56354ed9e25ab0e4ac3b5392b763d8d04b08e0c5", + "sha256:d6b430a9938a5a5d85fc107d852262ddcd48602c120e3dbb02137c83d212b380", + "sha256:da248f93f0418a9e9d94b0080d7ebc407a9a5e6d0b57bb30db9b5cc28de1ad33", + "sha256:da4dd7c9c50c059aba52b3524f84d7de956f7fef88f0bafcf4ad7dde94a064e8", + "sha256:df0623dcf9668ad0445e0558a21211d4e9a149ea8f5666917c8eeec515f0a6d1", + "sha256:e5168986b90a8d1f2f9dc1b841467c74221bd752537b99761a93d2d981e04889", + "sha256:efa29c2fe6b4fdd32e8ef81c1528506895eca86e1d8c4657fda04c9b3786ddf9", + "sha256:f1496ea22ca2c830cbcbd473de8f114a320da308438ae65abad6bab7867fe38f", + "sha256:f49e52d174375a7def9915c9f06ec4e569d235ad428f70751765f48d5926678c" ], "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'", - "version": "==4.9.1" + "version": "==4.9.2" }, "markupsafe": { "hashes": [ @@ -665,31 +672,31 @@ }, "packaging": { "hashes": [ - "sha256:2198ec20bd4c017b8f9717e00f0c8714076fc2fd93816750ab48e2c41de2cfd3", - "sha256:957e2148ba0e1a3b282772e791ef1d8083648bc131c8ab0c1feba110ce1146c3" + "sha256:714ac14496c3e68c99c29b00845f7a2b85f3bb6f1078fd9f72fd20f0570002b2", + "sha256:b6ad297f8907de0fa2fe1ccbd26fdaf387f5f47c7275fedf8cce89f99446cf97" ], "markers": "python_version >= '3.7'", - "version": "==22.0" + "version": "==23.0" }, "protobuf": { "hashes": [ - "sha256:25266bf373ee06d5d66f9eb1ec9d434b243dccce5c32faf151054cfa6f9dcbf1", - "sha256:260e346927fd4e6fbb49ab545137b19610c24a1d853dc5f29ddf777ab1987211", - "sha256:2c6a4d13732d9b094db31b3841986c38b17ac61a3fe05ee26a779d94c4c3fb43", - "sha256:4922e3320ed70e81f05060822da36923d09fd9e04e17f411f2d8d8d0070f9f5c", - "sha256:4b75c947289a2e9c1f37d21c593f1ef6fb4fed33977dfb2ac84f799eb29a8ff4", - "sha256:4d01ef83517c181d60ea1c6d0b2f644be250ade740d6554a2f5a021b1ad622e3", - "sha256:553e35c0878f6855e55f01a14561e6bce6df79b6636a5acf83b9d9ac7eab7922", - "sha256:85ccb4753ee21de7dc81a7a68a051f25dbe133ffa01a639ac998427d0b223387", - "sha256:a5a14b907a191319e7a58b38c583bbf50deb21e002f723a912c5e4f6969a778e", - "sha256:a944dc9550baae276afc7dc8193191d4c2ad660270a1e5ed5a71539817ebe2e2", - "sha256:bab4b21a986ded225b9392c07ce21c35d790951f51e1ebfd32e4d443b05c3726", - "sha256:c3b9e329b4c247dc3ba5c50f60915a84e08278eb6d9e3fa674d0d04ff816bfd7", - "sha256:d91a47c77b33580024b0271b65bb820c4e0264c25eb49151ad01e691de8fa0b6", - "sha256:efb16b16fd3eef25357f84d516062753014b76279ce4e0ec4880badd2fba7370" + "sha256:1f22ac0ca65bb70a876060d96d914dae09ac98d114294f77584b0d2644fa9c30", + "sha256:237216c3326d46808a9f7c26fd1bd4b20015fb6867dc5d263a493ef9a539293b", + "sha256:27f4d15021da6d2b706ddc3860fac0a5ddaba34ab679dc182b60a8bb4e1121cc", + "sha256:299ea899484ee6f44604deb71f424234f654606b983cb496ea2a53e3c63ab791", + "sha256:3d164928ff0727d97022957c2b849250ca0e64777ee31efd7d6de2e07c494717", + "sha256:6ab80df09e3208f742c98443b6166bcb70d65f52cfeb67357d52032ea1ae9bec", + "sha256:78a28c9fa223998472886c77042e9b9afb6fe4242bd2a2a5aced88e3f4422aa7", + "sha256:7cd532c4566d0e6feafecc1059d04c7915aec8e182d1cf7adee8b24ef1e2e6ab", + "sha256:89f9149e4a0169cddfc44c74f230d7743002e3aa0b9472d8c28f0388102fc4c2", + "sha256:a53fd3f03e578553623272dc46ac2f189de23862e68565e83dde203d41b76fc5", + "sha256:b135410244ebe777db80298297a97fbb4c862c881b4403b71bac9d4107d61fd1", + "sha256:b98d0148f84e3a3c569e19f52103ca1feacdac0d2df8d6533cf983d1fda28462", + "sha256:d1736130bce8cf131ac7957fa26880ca19227d4ad68b4888b3be0dea1f95df97", + "sha256:f45460f9ee70a0ec1b6694c6e4e348ad2019275680bd68a1d9314b8c7e01e574" ], "markers": "python_version >= '3.7'", - "version": "==4.21.11" + "version": "==4.21.12" }, "pyaes": { "hashes": [ @@ -774,11 +781,11 @@ }, "pygments": { "hashes": [ - "sha256:56a8508ae95f98e2b9bdf93a6be5ae3f7d8af858b43e02c5a2ff083726be40c1", - "sha256:f643f331ab57ba3c9d89212ee4a2dabc6e94f117cf4eefde99a0574720d14c42" + "sha256:b3ed06a9e8ac9a9aae5a6f5dbe78a8a58655d17b43b93c078f094ddc476ae297", + "sha256:fa7bd7bd2771287c0de303af8bfdfc731f51bd2c6a47ab69d117138893b82717" ], "markers": "python_version >= '3.6'", - "version": "==2.13.0" + "version": "==2.14.0" }, "pyparsing": { "hashes": [ @@ -822,18 +829,18 @@ }, "python-twitter-v2": { "hashes": [ - "sha256:18c14853da8b499775a11a3f5e1d0692a7017fa41eca91ac5afa73f35b935a90", - "sha256:fbe582ae7c6b33f6055b97e23dd106874e6650091d257fe67bfd024b96ebf8d6" + "sha256:2397d518c17bfbc16a3d414b1cf6d3c231fd8d322f21c755ac2215c9ee675537", + "sha256:4e03a30b2570fa4f17fbc7293d850fb8276c66be106d55e460b9287de37e1dd2" ], "index": "pypi", - "version": "==0.8.0" + "version": "==0.8.1" }, "pytz": { "hashes": [ - "sha256:222439474e9c98fced559f1709d89e6c9cbf8d79c794ff3eb9f8800064291427", - "sha256:e89512406b793ca39f5971bc999cc538ce125c0e51c27941bef4568b460095e2" + "sha256:7ccfae7b4b2c067464a6733c6261673fdb8fd1be905460396b97a073e9fa683a", + "sha256:93007def75ae22f7cd991c84e02d434876818661f8df9ad5df9e950ff4e52cfd" ], - "version": "==2022.6" + "version": "==2022.7" }, "pytz-deprecation-shim": { "hashes": [ @@ -1009,11 +1016,11 @@ }, "rich": { "hashes": [ - "sha256:a4eb26484f2c82589bd9a17c73d32a010b1e29d89f1604cd9bf3a2097b81bb5e", - "sha256:ba3a3775974105c221d31141f2c116f4fd65c5ceb0698657a11e9f295ec93fd0" + "sha256:25f83363f636995627a99f6e4abc52ed0970ebbd544960cc63cbb43aaac3d6f0", + "sha256:41fe1d05f433b0f4724cda8345219213d2bfa472ef56b2f64f415b5b94d51b04" ], - "markers": "python_version < '4' and python_full_version >= '3.6.3'", - "version": "==12.6.0" + "markers": "python_version >= '3.7'", + "version": "==13.0.1" }, "rsa": { "hashes": [ @@ -1080,11 +1087,11 @@ }, "telethon": { "hashes": [ - "sha256:148ac8c27908853d5d8a116d55ce947e9ba167bb697c75226ae95645b2e5a504", - "sha256:de7a1619110a2c06390fb5340839c6503c6b108b5f1a2f3bbe1ef60f02cecacb" + "sha256:3ec7ea04e61e0179dd08b974b609814e1a5298eeda3d68368a34bba754f43aec", + "sha256:d894f6ef2bf2cb119f6413b9f620957503785bab0999694b4bf67dea36f8ee09" ], "index": "pypi", - "version": "==1.26.0" + "version": "==1.26.1" }, "text-unidecode": { "hashes": [ @@ -1289,11 +1296,11 @@ "develop": { "autopep8": { "hashes": [ - "sha256:8b1659c7f003e693199f52caffdc06585bb0716900bbc6a7442fd931d658c077", - "sha256:ad924b42c2e27a1ac58e432166cc4588f5b80747de02d0d35b1ecbd3e7d57207" + "sha256:be5bc98c33515b67475420b7b1feafc8d32c1a69862498eda4983b45bffd2687", + "sha256:d27a8929d8dcd21c0f4b3859d2d07c6c25273727b98afc984c039df0f0d86566" ], "index": "pypi", - "version": "==2.0.0" + "version": "==2.0.1" }, "pycodestyle": { "hashes": [ @@ -1308,7 +1315,7 @@ "sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc", "sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f" ], - "markers": "python_version >= '3.7'", + "markers": "python_version < '3.11'", "version": "==2.0.1" } } diff --git a/src/configs/v2config.py b/src/configs/v2config.py index 7260d41..5b47d0f 100644 --- a/src/configs/v2config.py +++ b/src/configs/v2config.py @@ -6,6 +6,7 @@ from typing import List from archivers import Archiverv2 from feeders import Feeder from databases import Database +from formatters import Formatter from storages import StorageV2 from steps.step import Step from enrichers import Enricher @@ -21,13 +22,14 @@ class ConfigV2: Enricher, Archiverv2, Database, - StorageV2 + StorageV2, + Formatter # Util ] feeder: Step # TODO:= BaseFeeder + formatter: Formatter archivers: List[Archiverv2] = field(default_factory=[]) # TODO: fix type enrichers: List[Enricher] = field(default_factory=[]) - formatters: List[Step] = field(default_factory=[]) # TODO: fix type storages: List[Step] = field(default_factory=[]) # TODO: fix type databases: List[Database] = field(default_factory=[]) @@ -50,6 +52,7 @@ class ConfigV2: for configurable in self.configurable_parents: child: Step for child in configurable.__subclasses__(): + assert child.configs() is not None and type(child.configs()) == dict, f"class '{child.name}' should have a configs method returning a dict." for config, details in child.configs().items(): assert "." not in child.name, f"class prop name cannot contain dots('.'): {child.name}" assert "." not in config, f"config property cannot contain dots('.'): {config}" @@ -87,6 +90,7 @@ class ConfigV2: # print("config.py", self.config) self.feeder = Feeder.init(steps.get("feeder", "cli_feeder"), self.config) + self.formatter = Formatter.init(steps.get("formatter", "html_formatter"), self.config) self.enrichers = [Enricher.init(e, self.config) for e in steps.get("enrichers", [])] self.archivers = [Archiverv2.init(e, self.config) for e in steps.get("archivers", [])] self.databases = [Database.init(e, self.config) for e in steps.get("databases", [])] @@ -97,6 +101,7 @@ class ConfigV2: print("archivers", [e for e in self.archivers]) print("databases", [e for e in self.databases]) print("storages", [e for e in self.storages]) + print("formatter", self.formatter) def validate(self): pass diff --git a/src/formatters/__init__.py b/src/formatters/__init__.py new file mode 100644 index 0000000..07a52a0 --- /dev/null +++ b/src/formatters/__init__.py @@ -0,0 +1,2 @@ +from .formatter import Formatter +from .html_formatter import HtmlFormatter \ No newline at end of file diff --git a/src/formatters/formatter.py b/src/formatters/formatter.py new file mode 100644 index 0000000..7199be2 --- /dev/null +++ b/src/formatters/formatter.py @@ -0,0 +1,21 @@ +from __future__ import annotations +from dataclasses import dataclass +from abc import abstractmethod +from metadata import Metadata +from steps.step import Step + + +@dataclass +class Formatter(Step): + name = "formatter" + + def __init__(self, config: dict) -> None: + # without this STEP.__init__ is not called + super().__init__(config) + + def init(name: str, config: dict) -> Formatter: + # only for code typing + return Step.init(name, config, Formatter) + + @abstractmethod + def format(self, item) -> Metadata: return None \ No newline at end of file diff --git a/src/formatters/html_formatter.py b/src/formatters/html_formatter.py new file mode 100644 index 0000000..6c278f5 --- /dev/null +++ b/src/formatters/html_formatter.py @@ -0,0 +1,36 @@ +from __future__ import annotations +from dataclasses import dataclass +from abc import abstractmethod +from metadata import Metadata +from media import Media +from formatters import Formatter +from jinja2 import Environment, FileSystemLoader +import uuid, os, pathlib + + +@dataclass +class HtmlFormatter(Formatter): + name = "html_formatter" + + def __init__(self, config: dict) -> None: + # without this STEP.__init__ is not called + super().__init__(config) + self.environment = Environment(loader=FileSystemLoader(os.path.join(pathlib.Path(__file__).parent.resolve(), "templates/"))) + self.template = self.environment.get_template("html_template.html") + + @staticmethod + def configs() -> dict: + return {} + + def format(self, item: Metadata) -> Media: + print("FORMATTING") + content = self.template.render( + url=item.get_url(), + title=item.get_title(), + media=item.media, + metadata=item.get_clean_metadata() + ) + html_path = os.path.join(item.get("tmp_dir"), f"formatted{str(uuid.uuid4())}.html") + with open(html_path, mode="w", encoding="utf-8") as outf: + outf.write(content) + return Media(filename=html_path) diff --git a/src/formatters/templates/html_template.html b/src/formatters/templates/html_template.html new file mode 100644 index 0000000..fa278eb --- /dev/null +++ b/src/formatters/templates/html_template.html @@ -0,0 +1,101 @@ +{# templates/results.html #} + + + + + + + + {{ url }} + + + + +

    Archived media for {{ url }}

    +

    title: '{{ title }}'

    +

    content {{ media | length }} item(s)

    + + + + + + {% for m in media %} + + + + + {% endfor %} +
    aboutpreview
    +
      +
    • ARCHIVE
    • + {% if m.hash | length > 1 %} +
    • hash: {{ m.hash }}
    • + {% endif %} +
    • key: {{ m.key }}
    • +
    • type: {{ m.mimetype }}
    • +
    + +
    + {% if 'image' in m.mimetype %} + + {% elif 'video' in m.mimetype %} + + {% elif 'audio' in m.mimetype %} + + {% else %} + No preview available, please open the link. + {% endif %} +
    +

    metadata

    + + + + + + {% for key in metadata %} + + + + + {% endfor %} +
    keyvalue
    {{ key }}{{ metadata[key] }}
    + + + + \ No newline at end of file diff --git a/src/media.py b/src/media.py index c499b5b..58eae27 100644 --- a/src/media.py +++ b/src/media.py @@ -3,8 +3,7 @@ from __future__ import annotations from ast import List from typing import Any, Union, Dict from dataclasses import dataclass -from datetime import datetime -import json +import mimetypes @dataclass @@ -12,5 +11,11 @@ class Media: filename: str key: str = None cdn_url: str = None + mimetype: str = None # eg: image/jpeg # id: str = None # hash: str = None # TODO: added by enrichers + + def set_mimetype(self) -> Media: + if not self.mimetype: + self.mimetype = mimetypes.guess_type(self.filename)[0] + return self diff --git a/src/metadata.py b/src/metadata.py index f48c636..ceece8d 100644 --- a/src/metadata.py +++ b/src/metadata.py @@ -3,7 +3,8 @@ from __future__ import annotations from ast import List, Set from typing import Any, Union, Dict from dataclasses import dataclass, field -import datetime +import datetime, mimetypes +from loguru import logger # import json from media import Media @@ -12,9 +13,11 @@ from media import Media @dataclass class Metadata: status: str = "" + _processed_at: datetime = datetime.datetime.utcnow().replace(tzinfo=datetime.timezone.utc) metadata: Dict[str, Any] = field(default_factory=dict) tmp_keys: Set[str] = field(default_factory=set) # keys that are not to be saved in DBs media: List[Media] = field(default_factory=list) + final_media: Media = None # can be overwritten by formatters rearchivable: bool = False # def __init__(self, url, metadata = {}) -> None: @@ -85,13 +88,20 @@ class Metadata: return ts def add_media(self, media: Media) -> Metadata: - # print(f"adding {filename} to {self.metadata.get('media')}") - # return self.set("media", self.get_media() + [filename]) - # return self.get_media().append(media) + media.set_mimetype() return self.media.append(media) + def set_final_media(self, final: Media) -> Metadata: + if final: + if self.final_media: + logger.warning(f"overwriting final media value :{self.final_media} with {final}") + final.set_mimetype() + self.final_media = final + return self + def get_single_media(self) -> Media: - # TODO: check if formatters were applied and choose with priority + if self.final_media: + return self.final_media return self.media[0] # def as_json(self) -> str: @@ -99,6 +109,12 @@ class Metadata: # return json.dumps(self.metadata) # #TODO: datetime is not serializable + def get_clean_metadata(self) -> Metadata: + return dict( + {k: v for k, v in self.metadata.items() if k not in self.tmp_keys}, + **{"processed_at": self._processed_at} # TODO: move to enrichment + ) + def cleanup(self) -> Metadata: # TODO: refactor so it returns a JSON with all intended properties, except tmp_keys # the code below leads to errors if database needs tmp_keys after they are removed diff --git a/src/orchestrator.py b/src/orchestrator.py index 3bc5ea7..5a8ff31 100644 --- a/src/orchestrator.py +++ b/src/orchestrator.py @@ -4,6 +4,8 @@ from typing import Union, Dict from dataclasses import dataclass from archivers import Archiverv2 +from feeders import Feeder +from formatters import Formatter from storages import StorageV2 from enrichers import Enricher from databases import Database @@ -13,7 +15,6 @@ import tempfile, time, traceback from loguru import logger - """ how not to couple the different pieces of logic due to the use of constants for the metadata keys? @@ -132,7 +133,8 @@ class ArchivingOrchestrator: # Archiver.init(a, config) # for a in config.archivers # ] - self.feeder = config.feeder + self.feeder : Feeder = config.feeder + self.formatter : Formatter = config.formatter self.enrichers = config.enrichers self.archivers: List[Archiverv2] = config.archivers self.databases: List[Database] = config.databases @@ -237,14 +239,21 @@ class ArchivingOrchestrator: for e in self.enrichers: result.merge(e.enrich(result)) - # formatters, enrichers, and storages will sometimes look for specific properties: eg
  • Screenshot:
  • - for f in self.formatters: - result.merge(f.format(result)) - - # storage + # store media + unstored_media = result.media[::] + result.media = [] for s in self.storages: - for i, m in enumerate(result.media): - result.media[i] = s.store(m, result) + for m in unstored_media: + result.media.append(s.store(m, result)) + + # formatters, enrichers, and storages will sometimes look for specific properties: eg
  • Screenshot:
  • + # TODO: should there only be 1 formatter? + # for f in self.formatters: + # result.merge(f.format(result)) + # final format and store it + if (final_media := self.formatter.format(result)): + for s in self.storages: + result.set_final_media(s.store(final_media, result)) # signal completion to databases (DBs, Google Sheets, CSV, ...) # a hash registration service could be one database: forensic archiving diff --git a/src/steps/gsheet.py b/src/steps/gsheet.py index 6bfb5d7..262add1 100644 --- a/src/steps/gsheet.py +++ b/src/steps/gsheet.py @@ -12,6 +12,7 @@ class Gsheets(Step): super().__init__(config) self.gsheets_client = gspread.service_account(filename=self.service_account) assert type(self.header) == int, f"header ({self.header}) value must be an integer not {type(self.header)}" + assert self.sheet is not None, "You need to define a sheet name in your orchestration file when using gsheets." @staticmethod def configs() -> dict: From 0cb593fd2169f3134c47cd0446cab7369d7d6262 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Wed, 11 Jan 2023 00:03:47 +0000 Subject: [PATCH 033/190] wayback enricher ready --- src/archivers/telethon_archiverv2.py | 2 +- src/databases/gsheet_db.py | 6 +- src/enrichers/__init__.py | 3 +- src/enrichers/enricher.py | 2 +- src/enrichers/screenshot_enricher.py | 23 ++++--- src/enrichers/wayback_enricher.py | 68 +++++++++++++++++++ src/formatters/html_formatter.py | 2 +- src/formatters/templates/html_template.html | 7 +- src/media.py | 5 +- src/metadata.py | 18 ++++- src/orchestrator.py | 75 ++------------------- 11 files changed, 121 insertions(+), 90 deletions(-) create mode 100644 src/enrichers/wayback_enricher.py diff --git a/src/archivers/telethon_archiverv2.py b/src/archivers/telethon_archiverv2.py index ea19c92..66ecd74 100644 --- a/src/archivers/telethon_archiverv2.py +++ b/src/archivers/telethon_archiverv2.py @@ -121,7 +121,7 @@ class TelethonArchiver(Archiverv2): media_posts = self._get_media_posts_in_group(chat, post) logger.debug(f'got {len(media_posts)=} for {url=}') - tmp_dir = item.get("tmp_dir") + tmp_dir = item.get_tmp_dir() group_id = post.grouped_id if post.grouped_id is not None else post.id title = post.message diff --git a/src/databases/gsheet_db.py b/src/databases/gsheet_db.py index ba3785a..26aae68 100644 --- a/src/databases/gsheet_db.py +++ b/src/databases/gsheet_db.py @@ -68,13 +68,15 @@ class GsheetsDb(Database): batch_if_valid('title', item.get_title()) batch_if_valid('text', item.get("content", "")[:500]) batch_if_valid('timestamp', item.get_timestamp()) + if (screenshot := item.get_media_by_id("screenshot")): + batch_if_valid('screenshot', screenshot.cdn_url) + # batch_if_valid('status', item.status) # TODO: AFTER ENRICHMENTS # batch_if_valid('hash', media.hash) # batch_if_valid('thumbnail', result.thumbnail, f'=IMAGE("{result.thumbnail}")') # batch_if_valid('thumbnail_index', result.thumbnail_index) # batch_if_valid('duration', result.duration, str(result.duration)) - # batch_if_valid('screenshot', result.screenshot) # if result.wacz is not None: # batch_if_valid('wacz', result.wacz) # batch_if_valid('replaywebpage', f'https://replayweb.page/?source={quote(result.wacz)}#view=pages&url={quote(url)}') @@ -91,5 +93,5 @@ class GsheetsDb(Database): def _retrieve_gsheet(self, item: Metadata) -> Tuple[GWorksheet, int]: gw: GWorksheet = item.get("gsheet").get("worksheet") row: int = item.get("gsheet").get("row") - #TODO: to make gsheet_db less coupled with gsheet_feeder's "gsheet" parameter, this method could 1st try to fetch "gsheet" from item and, if missing, manage its own singleton - not needed for now + # TODO: to make gsheet_db less coupled with gsheet_feeder's "gsheet" parameter, this method could 1st try to fetch "gsheet" from item and, if missing, manage its own singleton - not needed for now return gw, row diff --git a/src/enrichers/__init__.py b/src/enrichers/__init__.py index 503ea2c..2a871d1 100644 --- a/src/enrichers/__init__.py +++ b/src/enrichers/__init__.py @@ -1,2 +1,3 @@ from .enricher import Enricher -from .screenshot_enricher import ScreenshotEnricher \ No newline at end of file +from .screenshot_enricher import ScreenshotEnricher +from .wayback_enricher import WaybackEnricher \ No newline at end of file diff --git a/src/enrichers/enricher.py b/src/enrichers/enricher.py index faf43d8..9d11276 100644 --- a/src/enrichers/enricher.py +++ b/src/enrichers/enricher.py @@ -18,4 +18,4 @@ class Enricher(Step, ABC): return Step.init(name, config, Enricher) @abstractmethod - def enrich(self, item: Metadata) -> Metadata: pass + def enrich(self, to_enrich: Metadata) -> None: pass diff --git a/src/enrichers/screenshot_enricher.py b/src/enrichers/screenshot_enricher.py index 5018859..b008e52 100644 --- a/src/enrichers/screenshot_enricher.py +++ b/src/enrichers/screenshot_enricher.py @@ -1,13 +1,14 @@ +from media import Media from utils import Webdriver from . import Enricher from metadata import Metadata from loguru import logger +import time, uuid, os from selenium.common.exceptions import TimeoutException -import time class ScreenshotEnricher(Enricher): - name = "screenshot" + name = "screenshot_enricher" @staticmethod def configs() -> dict: @@ -17,16 +18,18 @@ class ScreenshotEnricher(Enricher): "timeout": {"default": 60, "help": "timeout for taking the screenshot"} } - def enrich(self, item: Metadata) -> Metadata: - url = self.get_url(item) - print(f"enriching {url=}") - with Webdriver(self.width, self.height, self.timeout, 'facebook.com' in url) as driver: # TODO: make a util + def enrich(self, to_enrich: Metadata) -> None: + url = to_enrich.get_url() + logger.debug(f"Enriching screenshot for {url=}") + with Webdriver(self.width, self.height, self.timeout, 'facebook.com' in url) as driver: try: driver.get(url) time.sleep(2) + screenshot_file = os.path.join(to_enrich.get_tmp_dir(), f"screenshot_{str(uuid.uuid4())[0:8]}.png") + driver.save_screenshot(screenshot_file) + to_enrich.add_media(Media(filename=screenshot_file, id="screenshot")) except TimeoutException: logger.info("TimeoutException loading page for screenshot") - - #TODO: return saved object - driver.save_screenshot("TODO-HASH_OR_UUID.png") - return None + except Exception as e: + logger.error(f"Got error while loading webdriver for screenshot enricher: {e}") + # return None diff --git a/src/enrichers/wayback_enricher.py b/src/enrichers/wayback_enricher.py new file mode 100644 index 0000000..09a43e0 --- /dev/null +++ b/src/enrichers/wayback_enricher.py @@ -0,0 +1,68 @@ +from utils import Webdriver +from . import Enricher +from metadata import Metadata +from loguru import logger +from selenium.common.exceptions import TimeoutException +import time, requests + + +class WaybackEnricher(Enricher): + """ + Submits the current URL to the webarchive and returns a job_id or completed archive + """ + name = "wayback_enricher" + + def __init__(self, config: dict) -> None: + # without this STEP.__init__ is not called + super().__init__(config) + assert type(self.secret) == str and len(self.secret) > 0, "please provide a value for the wayback_enricher API key" + assert type(self.secret) == str and len(self.secret) > 0, "please provide a value for the wayback_enricher API secret" + + @staticmethod + def configs() -> dict: + return { + "timeout": {"default": 5, "help": "number of seconds to wait for a response from webarchive's wayback machine, after that only job_id is saved but page will still be processed."}, + "key": {"default": None, "help": "wayback API key. to get credentials visit https://archive.org/account/s3.php"}, + "secret": {"default": None, "help": "wayback API secret. to get credentials visit https://archive.org/account/s3.php"} + } + + def enrich(self, to_enrich: Metadata) -> None: + url = to_enrich.get_url() + logger.debug(f"Enriching wayback for {url=}") + + ia_headers = { + "Accept": "application/json", + "Authorization": f"LOW {self.key}:{self.secret}" + } + r = requests.post('https://web.archive.org/save/', headers=ia_headers, data={'url': url}) + + if r.status_code != 200: + logger.error(em:=f"Internet archive failed with status of {r.status_code}: {r.json()}") + to_enrich.set("wayback", em) + return + + # check job status + job_id = r.json()['job_id'] + + # waits at most timeout seconds until job is completed, otherwise only enriches the job_id information + start_time = time.time() + wayback_url = False + attempt = 1 + while not wayback_url and time.time() - start_time <= self.timeout: + try: + + logger.debug(f"GETting status for {job_id=} on {url=} ({attempt=})") + r_status = requests.get(f'https://web.archive.org/save/status/{job_id}', headers=ia_headers) + r_json = r_status.json() + if r_status.status_code == 200 and r_json['status'] == 'success': + wayback_url = f"https://web.archive.org/web/{r_json['timestamp']}/{r_json['original_url']}" + except Exception as e: + logger.warning(f"error fetching status for {url=} due to: {e}") + if not wayback_url: + attempt += 1 + time.sleep(1) # TODO: can be improved with exponential backoff + + if wayback_url: + to_enrich.set("wayback", wayback_url) + else: + to_enrich.set("wayback", {"job_id": job_id, "check_status": f'https://web.archive.org/save/status/{job_id}'}) diff --git a/src/formatters/html_formatter.py b/src/formatters/html_formatter.py index 6c278f5..7443568 100644 --- a/src/formatters/html_formatter.py +++ b/src/formatters/html_formatter.py @@ -30,7 +30,7 @@ class HtmlFormatter(Formatter): media=item.media, metadata=item.get_clean_metadata() ) - html_path = os.path.join(item.get("tmp_dir"), f"formatted{str(uuid.uuid4())}.html") + html_path = os.path.join(item.get_tmp_dir(), f"formatted{str(uuid.uuid4())}.html") with open(html_path, mode="w", encoding="utf-8") as outf: outf.write(content) return Media(filename=html_path) diff --git a/src/formatters/templates/html_template.html b/src/formatters/templates/html_template.html index fa278eb..fc986f0 100644 --- a/src/formatters/templates/html_template.html +++ b/src/formatters/templates/html_template.html @@ -60,6 +60,9 @@ {% endif %}
  • key: {{ m.key }}
  • type: {{ m.mimetype }}
  • + {% if m.id | length >0 %} +
  • id: {{ m.id }}
  • + {% endif %} @@ -91,11 +94,13 @@ {% for key in metadata %} {{ key }} - {{ metadata[key] }} + {{ metadata[key] | urlize }} {% endfor %} +
    +

    made with bellingcat/auto-archiver, add suggestions and report issues on the project's github page

    \ No newline at end of file diff --git a/src/media.py b/src/media.py index 58eae27..3c416be 100644 --- a/src/media.py +++ b/src/media.py @@ -12,10 +12,13 @@ class Media: key: str = None cdn_url: str = None mimetype: str = None # eg: image/jpeg - # id: str = None + id: str = None # in case this type of media needs a special id, eg: screenshot # hash: str = None # TODO: added by enrichers def set_mimetype(self) -> Media: if not self.mimetype: self.mimetype = mimetypes.guess_type(self.filename)[0] return self + + def is_video(self) -> bool: + return self.mimetype.startswith("video") diff --git a/src/metadata.py b/src/metadata.py index ceece8d..7af923c 100644 --- a/src/metadata.py +++ b/src/metadata.py @@ -28,9 +28,12 @@ class Metadata: """ merges two Metadata instances, will overwrite according to overwrite_left flag """ + if right is None: return self if overwrite_left: - self.status = right.status + if right.status and len(right.status): + self.status = right.status self.rearchivable |= right.rearchivable + self.tmp_keys |= right.tmp_keys for k, v in right.metadata.items(): assert k not in self.metadata or type(v) == type(self.get(k)) if type(v) not in [dict, list, set] or k not in self.metadata: @@ -76,6 +79,12 @@ class Metadata: def get_title(self) -> str: return self.get("title") + def set_tmp_dir(self, tmp_dir: str) -> Metadata: + return self.set("tmp_dir", tmp_dir, True) + + def get_tmp_dir(self) -> str: + return self.get("tmp_dir") + def set_timestamp(self, timestamp: datetime.datetime) -> Metadata: assert type(timestamp) == datetime.datetime, "set_timestamp expects a datetime instance" return self.set("timestamp", timestamp) @@ -88,9 +97,15 @@ class Metadata: return ts def add_media(self, media: Media) -> Metadata: + if media is None: return media.set_mimetype() return self.media.append(media) + def get_media_by_id(self, id:str) -> Media: + for m in self.media: + if m.id == id: return m + return None + def set_final_media(self, final: Media) -> Metadata: if final: if self.final_media: @@ -100,6 +115,7 @@ class Metadata: return self def get_single_media(self) -> Media: + #TODO: could be refactored to use a custom media.id if self.final_media: return self.final_media return self.media[0] diff --git a/src/orchestrator.py b/src/orchestrator.py index 5a8ff31..3d554e0 100644 --- a/src/orchestrator.py +++ b/src/orchestrator.py @@ -52,74 +52,6 @@ Cisticola considerations: 2. So the auto-archiver becomes like a puzzle and fixes to Cisticola scrapers can immediately benefit it, and contributions are focused on a single source or scraping """ -# @dataclass -# class Metadata: -# # does not handle files, only primitives -# # the only piece of logic to handle files is the archiver, enricher, and storage -# status: str -# # title: str -# # url: str -# # hash: str -# main_file: Metadata -# metadata: Dict[str, Metadata] - -# @staticmethod -# def merge(left, right : Metadata, overwrite_left=True) -> Metadata: -# # should return a merged version of the Metadata -# # will work for archived() and enriched() -# # what if 2 metadatas contain the same keys? only one can remain! : overwrite_left -# pass - -# def get(self, key) -> Union[Metadata, str]: -# # goes through metadata and returns the Metadata available -# pass - -# def as_json(self) -> str: -# # converts all metadata and data into JSON -# pass - - -""" -@dataclass -class ArchiveResult: - # maybe metadata can have status as well, eg: screenshot fails. should that be registered in the databases? likely yes - status: str - url: str - metadata: Metadata - # title, url, hash, other={} - # cdn_url: str = None - # thumbnail: str = None - # thumbnail_index: str = None - # duration: float = None - # title: str = None - # timestamp: datetime.datetime = None - # screenshot: str = None - # wacz: str = None - # hash: str = None - # media: list = field(default_factory=list) - - def __init__(self) -> None: pass - - def update(self, metadata) -> None: - # receive a Metadata instance and update itself with it! - pass - - def as_json(self) -> str: - # converts all metadata and data into JSON - pass -""" - -""" -There is a Superclass for: - * Database (should_process) - -How can GSheets work? it needs to feed from a READER (GSheets Feeder) - -Once an archiver returns a link to a local file (for eg to a storage), how do we then delete the produced local files? -The context metadata should include a temporary folder (maybe a LocalStorage instance?) -""" - - class ArchivingOrchestrator: def __init__(self, config) -> None: # in config.py we should test that the archivers exist and log mismatches (blocking execution) @@ -128,7 +60,7 @@ class ArchivingOrchestrator: # Is it possible to overwrite config.yaml values? it could be useful: share config file and modify gsheet_feeder.sheet via CLI # where does that update/processing happen? in config.py - # reflection for Archiver to know wihch child classes it has? use Archiver.__subclasses__ + # reflection for Archiver to know which child classes it has? use Archiver.__subclasses__ # self.archivers = [ # Archiver.init(a, config) # for a in config.archivers @@ -166,7 +98,7 @@ class ArchivingOrchestrator: print("ARCHIVING", item) try: with tempfile.TemporaryDirectory(dir="./") as tmp_dir: - item.set("tmp_dir", tmp_dir, True) + item.set_tmp_dir(tmp_dir) result = self.archive(item) print(result) except KeyboardInterrupt: @@ -226,6 +158,7 @@ class ArchivingOrchestrator: # do they need to be refreshed with every execution? # this is where the Hashes come from, the place with access to all content # the archiver does not have access to storage + # a.download(result) # TODO: refactor so there's not merge here result.merge(a.download(result)) # TODO: fix logic if True or result.is_success(): break @@ -237,7 +170,7 @@ class ArchivingOrchestrator: # maybe as a PDF? or a Markdown file # side captures: screenshot, wacz, webarchive, thumbnails, HTMLgenerator for e in self.enrichers: - result.merge(e.enrich(result)) + e.enrich(result) # store media unstored_media = result.media[::] From 6ca46417feeda7f6ac586214cbf40917f9d9b50f Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Thu, 12 Jan 2023 02:09:39 +0000 Subject: [PATCH 034/190] local storage + multiple storage support --- src/archivers/telethon_archiverv2.py | 4 +- src/databases/gsheet_db.py | 4 +- src/enrichers/wayback_enricher.py | 2 +- src/formatters/templates/html_template.html | 19 +++++---- src/media.py | 28 +++++++++---- src/metadata.py | 2 - src/orchestrator.py | 14 +++---- src/storages/__init__.py | 3 +- src/storages/local.py | 46 +++++++++++++++++++++ src/storages/s3.py | 25 +++++------ src/storages/storage.py | 24 ++++++----- 11 files changed, 117 insertions(+), 54 deletions(-) create mode 100644 src/storages/local.py diff --git a/src/archivers/telethon_archiverv2.py b/src/archivers/telethon_archiverv2.py index 66ecd74..6851cb5 100644 --- a/src/archivers/telethon_archiverv2.py +++ b/src/archivers/telethon_archiverv2.py @@ -132,11 +132,11 @@ class TelethonArchiver(Archiverv2): if mp.entities: other_media_urls = [e.url for e in mp.entities if hasattr(e, "url") and e.url and self._guess_file_type(e.url) in ["video", "image", "audio"]] if len(other_media_urls): - logger.debug(f"Got {len(other_media_urls)} other medial urls from {mp.id=}: {other_media_urls}") + logger.debug(f"Got {len(other_media_urls)} other media urls from {mp.id=}: {other_media_urls}") for i, om_url in enumerate(other_media_urls): filename = os.path.join(tmp_dir, f'{chat}_{group_id}_{i}') self.download_from_url(om_url, filename) - result.add_media(Media(filename)) + result.add_media(Media(filename=filename, id=f"{group_id}_{i}")) filename_dest = os.path.join(tmp_dir, f'{chat}_{group_id}', str(mp.id)) filename = self.client.download_media(mp.media, filename_dest) diff --git a/src/databases/gsheet_db.py b/src/databases/gsheet_db.py index 26aae68..0cf65ed 100644 --- a/src/databases/gsheet_db.py +++ b/src/databases/gsheet_db.py @@ -63,13 +63,13 @@ class GsheetsDb(Database): media: Media = item.get_single_media() - batch_if_valid('archive', media.cdn_url) + batch_if_valid('archive', "\n".join(media.urls)) batch_if_valid('date', True, datetime.datetime.utcnow().replace(tzinfo=datetime.timezone.utc).isoformat()) batch_if_valid('title', item.get_title()) batch_if_valid('text', item.get("content", "")[:500]) batch_if_valid('timestamp', item.get_timestamp()) if (screenshot := item.get_media_by_id("screenshot")): - batch_if_valid('screenshot', screenshot.cdn_url) + batch_if_valid('screenshot', "\n".join(screenshot.urls)) # batch_if_valid('status', item.status) # TODO: AFTER ENRICHMENTS diff --git a/src/enrichers/wayback_enricher.py b/src/enrichers/wayback_enricher.py index 09a43e0..bf55923 100644 --- a/src/enrichers/wayback_enricher.py +++ b/src/enrichers/wayback_enricher.py @@ -21,7 +21,7 @@ class WaybackEnricher(Enricher): @staticmethod def configs() -> dict: return { - "timeout": {"default": 5, "help": "number of seconds to wait for a response from webarchive's wayback machine, after that only job_id is saved but page will still be processed."}, + "timeout": {"default": 15, "help": "seconds to wait for successful archive confirmation from wayback, if more than this passes the result contains the job_id so the status can later be checked manually."}, "key": {"default": None, "help": "wayback API key. to get credentials visit https://archive.org/account/s3.php"}, "secret": {"default": None, "help": "wayback API secret. to get credentials visit https://archive.org/account/s3.php"} } diff --git a/src/formatters/templates/html_template.html b/src/formatters/templates/html_template.html index fc986f0..f488a5f 100644 --- a/src/formatters/templates/html_template.html +++ b/src/formatters/templates/html_template.html @@ -26,6 +26,7 @@ table, th, td { + margin: auto; border: 1px solid; border-collapse: collapse; } @@ -43,18 +44,17 @@

    Archived media for {{ url }}

    -

    title: '{{ title }}'

    +

    title: '{{ title }}'

    content {{ media | length }} item(s)

    - + {% for m in media %} {% endfor %}
    aboutpreviewpreview(s)
      -
    • ARCHIVE
    • {% if m.hash | length > 1 %}
    • hash: {{ m.hash }}
    • {% endif %} @@ -67,25 +67,28 @@
    + {% for url in m.urls %} {% if 'image' in m.mimetype %} - + {% elif 'video' in m.mimetype %} -
    -

    metadata

    +

    metadata

    @@ -100,7 +103,7 @@
    key

    -

    made with bellingcat/auto-archiver, add suggestions and report issues on the project's github page

    +

    Made with bellingcat/auto-archiver

    \ No newline at end of file diff --git a/src/media.py b/src/media.py index 3c416be..e50cc14 100644 --- a/src/media.py +++ b/src/media.py @@ -2,7 +2,7 @@ from __future__ import annotations from ast import List from typing import Any, Union, Dict -from dataclasses import dataclass +from dataclasses import dataclass, field import mimetypes @@ -10,15 +10,25 @@ import mimetypes class Media: filename: str key: str = None - cdn_url: str = None - mimetype: str = None # eg: image/jpeg - id: str = None # in case this type of media needs a special id, eg: screenshot + urls: List[str] = field(default_factory=list) + _mimetype: str = None # eg: image/jpeg + id: str = "" # in case this type of media needs a special id, eg: screenshot # hash: str = None # TODO: added by enrichers - def set_mimetype(self) -> Media: - if not self.mimetype: - self.mimetype = mimetypes.guess_type(self.filename)[0] - return self + def add_url(self, url: str) -> None: + # url can be remote, local, ... + self.urls.append(url) + + @property # getter .mimetype + def mimetype(self) -> str: + assert self.filename is not None and len(self.filename) > 0, "cannot get mimetype from media without filename" + if not self._mimetype: + self._mimetype = mimetypes.guess_type(self.filename)[0] + return self._mimetype + + @mimetype.setter # setter .mimetype + def mimetype(self, v: str) -> None: + self._mimetype = v def is_video(self) -> bool: - return self.mimetype.startswith("video") + return self._mimetype.startswith("video") diff --git a/src/metadata.py b/src/metadata.py index 7af923c..7f57c3b 100644 --- a/src/metadata.py +++ b/src/metadata.py @@ -98,7 +98,6 @@ class Metadata: def add_media(self, media: Media) -> Metadata: if media is None: return - media.set_mimetype() return self.media.append(media) def get_media_by_id(self, id:str) -> Media: @@ -110,7 +109,6 @@ class Metadata: if final: if self.final_media: logger.warning(f"overwriting final media value :{self.final_media} with {final}") - final.set_mimetype() self.final_media = final return self diff --git a/src/orchestrator.py b/src/orchestrator.py index 3d554e0..612ea2b 100644 --- a/src/orchestrator.py +++ b/src/orchestrator.py @@ -52,6 +52,7 @@ Cisticola considerations: 2. So the auto-archiver becomes like a puzzle and fixes to Cisticola scrapers can immediately benefit it, and contributions are focused on a single source or scraping """ + class ArchivingOrchestrator: def __init__(self, config) -> None: # in config.py we should test that the archivers exist and log mismatches (blocking execution) @@ -65,8 +66,8 @@ class ArchivingOrchestrator: # Archiver.init(a, config) # for a in config.archivers # ] - self.feeder : Feeder = config.feeder - self.formatter : Formatter = config.formatter + self.feeder: Feeder = config.feeder + self.formatter: Formatter = config.formatter self.enrichers = config.enrichers self.archivers: List[Archiverv2] = config.archivers self.databases: List[Database] = config.databases @@ -173,11 +174,9 @@ class ArchivingOrchestrator: e.enrich(result) # store media - unstored_media = result.media[::] - result.media = [] for s in self.storages: - for m in unstored_media: - result.media.append(s.store(m, result)) + for m in result.media: + s.store(m, result) # modifies media # formatters, enrichers, and storages will sometimes look for specific properties: eg
  • Screenshot:
  • # TODO: should there only be 1 formatter? @@ -186,7 +185,8 @@ class ArchivingOrchestrator: # final format and store it if (final_media := self.formatter.format(result)): for s in self.storages: - result.set_final_media(s.store(final_media, result)) + s.store(final_media, result) + result.set_final_media(final_media) # signal completion to databases (DBs, Google Sheets, CSV, ...) # a hash registration service could be one database: forensic archiving diff --git a/src/storages/__init__.py b/src/storages/__init__.py index 91ce148..4c0783c 100644 --- a/src/storages/__init__.py +++ b/src/storages/__init__.py @@ -5,4 +5,5 @@ from .s3_storage import S3Config, S3Storage from .gd_storage import GDConfig, GDStorage from .storage import StorageV2 -from .s3 import S3StorageV2 \ No newline at end of file +from .s3 import S3StorageV2 +from .local import LocalStorageV2 \ No newline at end of file diff --git a/src/storages/local.py b/src/storages/local.py new file mode 100644 index 0000000..aafb28c --- /dev/null +++ b/src/storages/local.py @@ -0,0 +1,46 @@ + +import shutil +from typing import IO, Any +import boto3, uuid, os, mimetypes +from botocore.errorfactory import ClientError +from metadata import Metadata +from media import Media +from storages import StorageV2 +from loguru import logger +from slugify import slugify + + +class LocalStorageV2(StorageV2): + name = "local_storage" + + def __init__(self, config: dict) -> None: + super().__init__(config) + os.makedirs(self.save_to, exist_ok=True) + + @staticmethod + def configs() -> dict: + return { + "save_to": {"default": "./archived", "help": "folder where to save archived content"}, + "flatten": {"default": True, "help": "if true saves all files to the root of 'save_to' directory, if false preserves subdir structure"}, + "save_absolute": {"default": False, "help": "whether the path to the stored file is absolute or relative (leaks the file structure)"}, + } + + def get_cdn_url(self, media: Media) -> str: + dest = os.path.join(self.save_to, media.key) + if self.save_absolute: + dest = os.path.abspath(dest) + return dest + + def upload(self, media: Media, **kwargs) -> bool: + # override parent so that we can use shutil.copy2 and keep metadata + if self.flatten: + dest = os.path.join(self.save_to, slugify(media.key)) + else: + dest = os.path.join(self.save_to, media.key) + + os.makedirs(dest, exist_ok=True) + logger.debug(f'[{self.__class__.name}] storing file {media.filename} with key {media.key} to {dest}') + shutil.copy2(media.filename, dest) + return True + + def uploadf(self, file: IO[bytes], key: str, **kwargs: dict) -> bool: pass diff --git a/src/storages/s3.py b/src/storages/s3.py index d4457e8..acd907e 100644 --- a/src/storages/s3.py +++ b/src/storages/s3.py @@ -45,26 +45,27 @@ class S3StorageV2(StorageV2): def get_cdn_url(self, media: Media) -> str: return self.cdn_url.format(bucket=self.bucket, region=self.region, key=media.key) - def uploadf(self, file: IO[bytes], media: Media, **kwargs: dict) -> Any: + def uploadf(self, file: IO[bytes], media: Media, **kwargs: dict) -> None: extra_args = kwargs.get("extra_args", {}) if not self.private and 'ACL' not in extra_args: extra_args['ACL'] = 'public-read' if 'ContentType' not in extra_args: try: - extra_args['ContentType'] = mimetypes.guess_type(media.key)[0] + extra_args['ContentType'] = media.mimetype except Exception as e: logger.warning(f"Unable to get mimetype for {media.key=}, error: {e}") self.s3.upload_fileobj(file, Bucket=self.bucket, Key=media.key, ExtraArgs=extra_args) + return True - def exists(self, key: str) -> bool: - """ - Tests if a given file with key=key exists in the bucket - """ - try: - self.s3.head_object(Bucket=self.bucket, Key=key) - return True - except ClientError as e: - logger.warning(f"got a ClientError when checking if {key=} exists in bucket={self.bucket}: {e}") - return False + # def exists(self, key: str) -> bool: + # """ + # Tests if a given file with key=key exists in the bucket + # """ + # try: + # self.s3.head_object(Bucket=self.bucket, Key=key) + # return True + # except ClientError as e: + # logger.warning(f"got a ClientError when checking if {key=} exists in bucket={self.bucket}: {e}") + # return False diff --git a/src/storages/storage.py b/src/storages/storage.py index 06346e9..61d4c77 100644 --- a/src/storages/storage.py +++ b/src/storages/storage.py @@ -7,6 +7,7 @@ from metadata import Metadata from steps.step import Step from loguru import logger import os, uuid +from slugify import slugify @dataclass @@ -21,23 +22,26 @@ class StorageV2(Step): def init(name: str, config: dict) -> StorageV2: return Step.init(name, config, StorageV2) - def store(self, media: Media, item: Metadata) -> Media: - media = self.set_key(media, item) + def store(self, media: Media, item: Metadata) -> None: + self.set_key(media, item) self.upload(media) - media.cdn_url = self.get_cdn_url(media) - return media + media.add_url(self.get_cdn_url(media)) @abstractmethod - def uploadf(self, file: IO[bytes], key: str, **kwargs: dict) -> Any: pass + def get_cdn_url(self, media: Media) -> str: pass - def upload(self, media: Media, **kwargs) -> Any: - logger.debug(f'[{self.__class__.name}] uploading file {media.filename} with key {media.key}') + @abstractmethod + def uploadf(self, file: IO[bytes], key: str, **kwargs: dict) -> bool: pass + + def upload(self, media: Media, **kwargs) -> bool: + logger.debug(f'[{self.__class__.name}] storing file {media.filename} with key {media.key}') with open(media.filename, 'rb') as f: return self.uploadf(f, media, **kwargs) - def set_key(self, media: Media, item: Metadata) -> Media: + def set_key(self, media: Media, item: Metadata) -> None: """takes the media and optionally item info and generates a key""" + if media.key is not None and len(media.key) > 0: return folder = item.get("folder", "") ext = os.path.splitext(media.filename)[1] - media.key = os.path.join(folder, f"{str(uuid.uuid4())}{ext}") - return media + # media.key = os.path.join(folder, f"{str(uuid.uuid4())}{ext}") + media.key = os.path.join(folder, slugify(item.get_url()), f"{str(uuid.uuid4())}{ext}") From 74e50eccf11b389a1feee464ad486d850a4ef0b9 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Fri, 13 Jan 2023 02:12:08 +0000 Subject: [PATCH 035/190] hash enricher and media refactor --- src/archivers/telethon_archiverv2.py | 2 +- src/configs/v2config.py | 2 +- src/enrichers/__init__.py | 3 +- src/enrichers/hash_enricher.py | 41 +++++++++++ src/enrichers/screenshot_enricher.py | 2 +- src/formatters/templates/html_template.html | 77 +++++++++++++++++---- src/media.py | 13 +++- src/metadata.py | 13 ++-- 8 files changed, 129 insertions(+), 24 deletions(-) create mode 100644 src/enrichers/hash_enricher.py diff --git a/src/archivers/telethon_archiverv2.py b/src/archivers/telethon_archiverv2.py index 6851cb5..819070a 100644 --- a/src/archivers/telethon_archiverv2.py +++ b/src/archivers/telethon_archiverv2.py @@ -136,7 +136,7 @@ class TelethonArchiver(Archiverv2): for i, om_url in enumerate(other_media_urls): filename = os.path.join(tmp_dir, f'{chat}_{group_id}_{i}') self.download_from_url(om_url, filename) - result.add_media(Media(filename=filename, id=f"{group_id}_{i}")) + result.add_media(Media(filename=filename), id=f"{group_id}_{i}") filename_dest = os.path.join(tmp_dir, f'{chat}_{group_id}', str(mp.id)) filename = self.client.download_media(mp.media, filename_dest) diff --git a/src/configs/v2config.py b/src/configs/v2config.py index 5b47d0f..dec3565 100644 --- a/src/configs/v2config.py +++ b/src/configs/v2config.py @@ -57,7 +57,7 @@ class ConfigV2: assert "." not in child.name, f"class prop name cannot contain dots('.'): {child.name}" assert "." not in config, f"config property cannot contain dots('.'): {config}" config_path = f"{child.name}.{config}" - parser.add_argument(f'--{config_path}', action='store', dest=config_path, help=f"{details['help']} (defaults to {details['default']})") + parser.add_argument(f'--{config_path}', action='store', dest=config_path, help=f"{details['help']} (defaults to {details['default']})", choices=details.get("choices", None)) self.defaults[config_path] = details["default"] if "cli_set" in details: self.cli_ops[config_path] = details["cli_set"] diff --git a/src/enrichers/__init__.py b/src/enrichers/__init__.py index 2a871d1..95b3fad 100644 --- a/src/enrichers/__init__.py +++ b/src/enrichers/__init__.py @@ -1,3 +1,4 @@ from .enricher import Enricher from .screenshot_enricher import ScreenshotEnricher -from .wayback_enricher import WaybackEnricher \ No newline at end of file +from .wayback_enricher import WaybackEnricher +from .hash_enricher import HashEnricher \ No newline at end of file diff --git a/src/enrichers/hash_enricher.py b/src/enrichers/hash_enricher.py new file mode 100644 index 0000000..786c861 --- /dev/null +++ b/src/enrichers/hash_enricher.py @@ -0,0 +1,41 @@ +import hashlib +from utils import Webdriver +from . import Enricher +from metadata import Metadata +from loguru import logger +from selenium.common.exceptions import TimeoutException +import time, requests + + +class HashEnricher(Enricher): + """ + Calculates hashes for Media instances + """ + name = "hash_enricher" + + def __init__(self, config: dict) -> None: + # without this STEP.__init__ is not called + super().__init__(config) + algo_choices = self.configs()["algorithm"]["choices"] + assert self.algorithm in algo_choices, f"Invalid hash algorithm selected, must be one of {algo_choices} (you selected {self.algorithm})." + + @staticmethod + def configs() -> dict: + return { + "algorithm": {"default": "SHA-256", "help": "hash algorithm to use", "choices": ["SHA-256", "SHA3-512"]} + } + + def enrich(self, to_enrich: Metadata) -> None: + url = to_enrich.get_url() + logger.debug(f"calculating media hashes for {url=} (using {self.algorithm})") + + for i, m in enumerate(to_enrich.media): + with open(m.filename, "rb") as f: + bytes = f.read() # read entire file as bytes + hash = None + if self.algorithm == "SHA-256": + hash = hashlib.sha256(bytes) + elif self.algorithm == "SHA3-512": + hash = hashlib.sha3_512(bytes) + else: continue + to_enrich.media[i].set("hash", f"{self.algorithm}:{hash.hexdigest()}") diff --git a/src/enrichers/screenshot_enricher.py b/src/enrichers/screenshot_enricher.py index b008e52..0375e3b 100644 --- a/src/enrichers/screenshot_enricher.py +++ b/src/enrichers/screenshot_enricher.py @@ -27,7 +27,7 @@ class ScreenshotEnricher(Enricher): time.sleep(2) screenshot_file = os.path.join(to_enrich.get_tmp_dir(), f"screenshot_{str(uuid.uuid4())[0:8]}.png") driver.save_screenshot(screenshot_file) - to_enrich.add_media(Media(filename=screenshot_file, id="screenshot")) + to_enrich.add_media(Media(filename=screenshot_file), id="screenshot") except TimeoutException: logger.info("TimeoutException loading page for screenshot") except Exception as e: diff --git a/src/formatters/templates/html_template.html b/src/formatters/templates/html_template.html index f488a5f..e757cae 100644 --- a/src/formatters/templates/html_template.html +++ b/src/formatters/templates/html_template.html @@ -39,12 +39,29 @@ .center { text-align: center; } + + .copy:hover { + font-weight: 600; + cursor: copy; + } + + #notification { + position: fixed; + right: 20px; + top: 20px; + background: aquamarine; + box-shadow: 6px 8px 5px 0px #000000; + padding: 10px; + font-size: large; + display: none; + } +

    Archived media for {{ url }}

    -

    title: '{{ title }}'

    +

    title: '{{ title }}'

    content {{ media | length }} item(s)

    @@ -55,21 +72,24 @@ @@ -97,7 +120,9 @@ {% for key in metadata %} - + {% endfor %}
      - {% if m.hash | length > 1 %} -
    • hash: {{ m.hash }}
    • - {% endif %} -
    • key: {{ m.key }}
    • -
    • type: {{ m.mimetype }}
    • - {% if m.id | length >0 %} -
    • id: {{ m.id }}
    • +
    • key: {{ m.key }}
    • +
    • type: {{ m.mimetype }}
    • + + {% for prop in m.properties %} + {% if m.properties[prop] | length > 1 %} +
    • {{ prop }}: {{ m.properties[prop] }}
    • {% endif %} + {% endfor %}
    {% for url in m.urls %} + {% if 'http' in url %} {% if 'image' in m.mimetype %} - + + + {% elif 'video' in m.mimetype %}
    {{ key }}{{ metadata[key] | urlize }} + {{ metadata[key] | urlize }} +
    @@ -105,5 +130,33 @@

    Made with bellingcat/auto-archiver

    + \ No newline at end of file diff --git a/src/media.py b/src/media.py index e50cc14..f0f91a2 100644 --- a/src/media.py +++ b/src/media.py @@ -8,12 +8,19 @@ import mimetypes @dataclass class Media: + # other properties eg: hash, id, exif, ... filename: str key: str = None - urls: List[str] = field(default_factory=list) _mimetype: str = None # eg: image/jpeg - id: str = "" # in case this type of media needs a special id, eg: screenshot - # hash: str = None # TODO: added by enrichers + urls: List[str] = field(default_factory=list) + properties: dict = field(default_factory=dict) + + def set(self, key: str, value: Any) -> Media: + self.properties[key] = value + return self + + def get(self, key: str, default: Any = None) -> Any: + return self.properties.get(key, default) def add_url(self, url: str) -> None: # url can be remote, local, ... diff --git a/src/metadata.py b/src/metadata.py index 7f57c3b..70984fa 100644 --- a/src/metadata.py +++ b/src/metadata.py @@ -96,13 +96,16 @@ class Metadata: if iso: return ts.isoformat() return ts - def add_media(self, media: Media) -> Metadata: + def add_media(self, media: Media, id: str = None) -> Metadata: + # adds a new media, optionally including an id if media is None: return - return self.media.append(media) + if id is not None: media.set("id", id) + self.media.append(media) + return media - def get_media_by_id(self, id:str) -> Media: + def get_media_by_id(self, id: str) -> Media: for m in self.media: - if m.id == id: return m + if m.get("id") == id: return m return None def set_final_media(self, final: Media) -> Metadata: @@ -113,7 +116,7 @@ class Metadata: return self def get_single_media(self) -> Media: - #TODO: could be refactored to use a custom media.id + # TODO: could be refactored to use a custom media.id if self.final_media: return self.final_media return self.media[0] From 47dc7881433b6f8fe7efdbf9213e3dc2c663c964 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Tue, 17 Jan 2023 16:29:27 +0000 Subject: [PATCH 036/190] thumbnails enricher --- src/enrichers/__init__.py | 3 +- src/enrichers/thumbnail_enricher.py | 46 +++++++++ src/enrichers/wayback_enricher.py | 3 +- src/formatters/html_formatter.py | 44 ++++++++- src/formatters/templates/html_template.html | 104 +++++++++++++++----- src/formatters/templates/media.html | 28 ++++++ src/media.py | 7 +- src/orchestrator.py | 8 ++ 8 files changed, 208 insertions(+), 35 deletions(-) create mode 100644 src/enrichers/thumbnail_enricher.py create mode 100644 src/formatters/templates/media.html diff --git a/src/enrichers/__init__.py b/src/enrichers/__init__.py index 95b3fad..8b9220b 100644 --- a/src/enrichers/__init__.py +++ b/src/enrichers/__init__.py @@ -1,4 +1,5 @@ from .enricher import Enricher from .screenshot_enricher import ScreenshotEnricher from .wayback_enricher import WaybackEnricher -from .hash_enricher import HashEnricher \ No newline at end of file +from .hash_enricher import HashEnricher +from .thumbnail_enricher import ThumbnailEnricher \ No newline at end of file diff --git a/src/enrichers/thumbnail_enricher.py b/src/enrichers/thumbnail_enricher.py new file mode 100644 index 0000000..32e09be --- /dev/null +++ b/src/enrichers/thumbnail_enricher.py @@ -0,0 +1,46 @@ +import uuid +from media import Media +from . import Enricher +from metadata import Metadata +from loguru import logger +import ffmpeg, os + + +class ThumbnailEnricher(Enricher): + """ + Generates thumbnails for all the media + """ + name = "thumbnail_enricher" + + def __init__(self, config: dict) -> None: + # without this STEP.__init__ is not called + super().__init__(config) + + @staticmethod + def configs() -> dict: + return {} + + def enrich(self, to_enrich: Metadata) -> None: + logger.debug(f"generating thumbnails") + folder = os.path.join(to_enrich.get_tmp_dir(), str(uuid.uuid4())) + os.makedirs(folder, exist_ok=True) + for i, m in enumerate(to_enrich.media[::]): + if m.is_video(): + logger.debug(f"generating thumbnails for {m.filename}") + fps, duration = 0.5, m.get("duration") + if duration is not None: + duration = float(duration) + if duration < 60: fps = 10.0 / duration + elif duration < 120: fps = 20.0 / duration + else: fps = 40.0 / duration + + stream = ffmpeg.input(m.filename) + stream = ffmpeg.filter(stream, 'fps', fps=fps).filter('scale', 512, -1) + stream.output(os.path.join(folder, 'out%d.jpg')).run() + + thumbnails = os.listdir(folder) + thumbnails_media = [] + for t, fname in enumerate(thumbnails): + if fname[-3:] == 'jpg': + thumbnails_media.append(Media(filename=os.path.join(folder, fname)).set("id", f"thumbnail_{t}")) + to_enrich.media[i].set("thumbnails", thumbnails_media) diff --git a/src/enrichers/wayback_enricher.py b/src/enrichers/wayback_enricher.py index bf55923..429f218 100644 --- a/src/enrichers/wayback_enricher.py +++ b/src/enrichers/wayback_enricher.py @@ -37,7 +37,7 @@ class WaybackEnricher(Enricher): r = requests.post('https://web.archive.org/save/', headers=ia_headers, data={'url': url}) if r.status_code != 200: - logger.error(em:=f"Internet archive failed with status of {r.status_code}: {r.json()}") + logger.error(em := f"Internet archive failed with status of {r.status_code}: {r.json()}") to_enrich.set("wayback", em) return @@ -66,3 +66,4 @@ class WaybackEnricher(Enricher): to_enrich.set("wayback", wayback_url) else: to_enrich.set("wayback", {"job_id": job_id, "check_status": f'https://web.archive.org/save/status/{job_id}'}) + to_enrich.set("wayback lookup", f"https://web.archive.org/web/*/{url}") diff --git a/src/formatters/html_formatter.py b/src/formatters/html_formatter.py index 7443568..a78ff2b 100644 --- a/src/formatters/html_formatter.py +++ b/src/formatters/html_formatter.py @@ -1,6 +1,7 @@ from __future__ import annotations from dataclasses import dataclass from abc import abstractmethod +import mimetypes from metadata import Metadata from media import Media from formatters import Formatter @@ -16,14 +17,28 @@ class HtmlFormatter(Formatter): # without this STEP.__init__ is not called super().__init__(config) self.environment = Environment(loader=FileSystemLoader(os.path.join(pathlib.Path(__file__).parent.resolve(), "templates/"))) + self.environment.filters.update({ + 'is_list': is_list_jinja, + 'is_video': is_video_jinja, + 'is_image': is_image_jinja, + 'is_audio': is_audio_jinja, + 'is_media': is_media_jinja, + }) self.template = self.environment.get_template("html_template.html") @staticmethod def configs() -> dict: - return {} + return { + "detect_thumbnails": {"default": True, "help": "if true will group by thumbnails generated by thumbnail enricher by id 'thumbnail_00'"}, + + } def format(self, item: Metadata) -> Media: - print("FORMATTING") + media = item.media + # thumbnails + # TODO: thumbnails_media work per media, gah + # if self.detect_thumbnails: + content = self.template.render( url=item.get_url(), title=item.get_title(), @@ -34,3 +49,28 @@ class HtmlFormatter(Formatter): with open(html_path, mode="w", encoding="utf-8") as outf: outf.write(content) return Media(filename=html_path) + + +# JINJA helper filters + + +def is_list_jinja(v) -> bool: + return isinstance(v, list) + + +def is_video_jinja(s: str) -> bool: + m = mimetypes.guess_type(s)[0] + return "video" in (m or "") + + +def is_image_jinja(s: str) -> bool: + m = mimetypes.guess_type(s)[0] + return "image" in (m or "") + + +def is_audio_jinja(s: str) -> bool: + m = mimetypes.guess_type(s)[0] + return "audio" in (m or "") + +def is_media_jinja(v) -> bool: + return isinstance(v, Media) diff --git a/src/formatters/templates/html_template.html b/src/formatters/templates/html_template.html index e757cae..47cceae 100644 --- a/src/formatters/templates/html_template.html +++ b/src/formatters/templates/html_template.html @@ -1,5 +1,5 @@ {# templates/results.html #} - +{% import 'media.html' as macros %} @@ -55,6 +55,45 @@ font-size: large; display: none; } + + img, + video { + filter: gray; + -webkit-filter: grayscale(1); + filter: grayscale(1); + } + + /* Disable grayscale on hover */ + img:hover, + video:hover { + -webkit-filter: grayscale(0); + filter: none; + } + + .collapsible { + background-color: #777; + color: white; + cursor: pointer; + padding: 5px; + margin: 10px; + width: 100%; + border: none; + text-align: left; + outline: none; + font-size: 15px; + } + + .active, + .collapsible:hover { + background-color: #555; + } + + .collapsible-content { + padding: 0 18px; + display: none; + overflow: hidden; + background-color: #f1f1f1; + } @@ -76,37 +115,31 @@
  • type: {{ m.mimetype }}
  • {% for prop in m.properties %} - {% if m.properties[prop] | length > 1 %} + + {% if m.properties[prop] | is_list %} +

    +
    + {{ prop }}: +
    + {% for subprop in m.properties[prop] %} + {% if subprop | is_media %} + {{ macros.display_media(subprop) }} + {% else %} + {{ subprop }} + {% endif %} + {% endfor %} +
    +
    +

    + {% elif m.properties[prop] | length > 1 %}
  • {{ prop }}: {{ m.properties[prop] }}
  • {% endif %} + {% endfor %} - - {% for url in m.urls %} - {% if 'http' in url %} - {% if 'image' in m.mimetype %} - - - - {% elif 'video' in m.mimetype %} - - {% elif 'audio' in m.mimetype %} - - {% else %} - No preview available. - {% endif %} - {% endif %} -
    - open or - download - {% endfor %} + {{ macros.display_media(m) }} {% endfor %} @@ -153,10 +186,27 @@ if (e.clipboardData) { e.clipboardData.setData("text/plain", el.textContent); console.log(e.clipboardData.getData("text")) - showNotification("copied...") + showNotification("copied!") } }) }) + + // collapsibles + let coll = document.getElementsByClassName("collapsible"); + let i; + + for (i = 0; i < coll.length; i++) { + coll[i].addEventListener("click", function() { + this.classList.toggle("active"); + // let content = this.nextElementSibling; + let content = this.parentElement.querySelector(".collapsible-content"); + if (content.style.display === "block") { + content.style.display = "none"; + } else { + content.style.display = "block"; + } + }); + } \ No newline at end of file diff --git a/src/formatters/templates/media.html b/src/formatters/templates/media.html new file mode 100644 index 0000000..6ee2502 --- /dev/null +++ b/src/formatters/templates/media.html @@ -0,0 +1,28 @@ +{% macro display_media(m) -%} + +{% for url in m.urls %} +{% if url | length == 0 %} +No URL available for {{ m.key }}. +{% elif 'http' in url %} +{% if 'image' in m.mimetype %} + + + +{% elif 'video' in m.mimetype %} + +{% elif 'audio' in m.mimetype %} + +{% else %} +No preview available for {{ m.key }}. +{% endif %} +{% else %} +{{ m.url | urlize }} +{% endif %} +{% endfor %} + +{%- endmacro -%} \ No newline at end of file diff --git a/src/media.py b/src/media.py index f0f91a2..949ab6e 100644 --- a/src/media.py +++ b/src/media.py @@ -1,18 +1,17 @@ from __future__ import annotations from ast import List -from typing import Any, Union, Dict +from typing import Any from dataclasses import dataclass, field import mimetypes @dataclass class Media: - # other properties eg: hash, id, exif, ... filename: str key: str = None - _mimetype: str = None # eg: image/jpeg urls: List[str] = field(default_factory=list) + _mimetype: str = None # eg: image/jpeg properties: dict = field(default_factory=dict) def set(self, key: str, value: Any) -> Media: @@ -38,4 +37,4 @@ class Media: self._mimetype = v def is_video(self) -> bool: - return self._mimetype.startswith("video") + return self.mimetype.startswith("video") diff --git a/src/orchestrator.py b/src/orchestrator.py index 612ea2b..be77d3d 100644 --- a/src/orchestrator.py +++ b/src/orchestrator.py @@ -6,6 +6,7 @@ from dataclasses import dataclass from archivers import Archiverv2 from feeders import Feeder from formatters import Formatter +from media import Media from storages import StorageV2 from enrichers import Enricher from databases import Database @@ -177,6 +178,13 @@ class ArchivingOrchestrator: for s in self.storages: for m in result.media: s.store(m, result) # modifies media + # Media can be inside media properties, examples include transformations on original media + for prop in m.properties.values(): + if isinstance(prop, Media): + s.store(prop, result) + if isinstance(prop, list) and len(prop)>0 and isinstance(prop[0], Media): + for prop_media in prop: + s.store(prop_media, result) # formatters, enrichers, and storages will sometimes look for specific properties: eg
  • Screenshot:
  • # TODO: should there only be 1 formatter? From f1bc83818deef5c0ccbbcc8c25e092290ba16ace Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Tue, 17 Jan 2023 17:01:25 +0000 Subject: [PATCH 037/190] template updates --- src/formatters/templates/html_template.html | 4 +++- src/formatters/templates/media.html | 9 +++++++-- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/src/formatters/templates/html_template.html b/src/formatters/templates/html_template.html index 47cceae..3d99a41 100644 --- a/src/formatters/templates/html_template.html +++ b/src/formatters/templates/html_template.html @@ -29,6 +29,7 @@ margin: auto; border: 1px solid; border-collapse: collapse; + vertical-align:top; } table.metadata td:first-child { @@ -120,6 +121,7 @@

    {{ prop }}: +

    {% for subprop in m.properties[prop] %} {% if subprop | is_media %} @@ -139,7 +141,7 @@ - {{ macros.display_media(m) }} + {{ macros.display_media(m, true) }} {% endfor %} diff --git a/src/formatters/templates/media.html b/src/formatters/templates/media.html index 6ee2502..db071cf 100644 --- a/src/formatters/templates/media.html +++ b/src/formatters/templates/media.html @@ -1,4 +1,4 @@ -{% macro display_media(m) -%} +{% macro display_media(m, links) -%} {% for url in m.urls %} {% if url | length == 0 %} @@ -22,7 +22,12 @@ No preview available for {{ m.key }}. {% endif %} {% else %} {{ m.url | urlize }} -{% endif %} +{% endif %} +{% if links %} +
    +open or +download +{% endif %} {% endfor %} {%- endmacro -%} \ No newline at end of file From 725bab82409bf7c8390af2914e4ae9c61a13d99d Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Wed, 18 Jan 2023 00:15:18 +0000 Subject: [PATCH 038/190] twitter archivers --- src/archivers/__init__.py | 10 +- src/archivers/telethon_archiverv2.py | 6 +- src/archivers/twitter_api_archiverv2.py | 97 +++++++++++++ src/archivers/twitter_archiverv2.py | 137 ++++++++++++++++++ src/enrichers/thumbnail_enricher.py | 1 + src/formatters/templates/html_template.html | 8 +- .../templates/{media.html => macros.html} | 6 + src/metadata.py | 18 ++- src/orchestrator.py | 9 +- 9 files changed, 273 insertions(+), 19 deletions(-) create mode 100644 src/archivers/twitter_api_archiverv2.py create mode 100644 src/archivers/twitter_archiverv2.py rename src/formatters/templates/{media.html => macros.html} (89%) diff --git a/src/archivers/__init__.py b/src/archivers/__init__.py index a2cb67c..f25668d 100644 --- a/src/archivers/__init__.py +++ b/src/archivers/__init__.py @@ -2,13 +2,15 @@ from .base_archiver import Archiver, ArchiveResult from .archiver import Archiverv2 from .telegram_archiver import TelegramArchiver -from .telethon_archiver import TelethonArchiver +# from .telethon_archiver import TelethonArchiver from .tiktok_archiver import TiktokArchiver from .wayback_archiver import WaybackArchiver from .youtubedl_archiver import YoutubeDLArchiver -from .twitter_archiver import TwitterArchiver +# from .twitter_archiver import TwitterArchiver from .vk_archiver import VkArchiver -from .twitter_api_archiver import TwitterApiArchiver +# from .twitter_api_archiver import TwitterApiArchiver from .instagram_archiver import InstagramArchiver -from .telethon_archiverv2 import TelethonArchiver \ No newline at end of file +from .telethon_archiverv2 import TelethonArchiver +from .twitter_archiverv2 import TwitterArchiver +from .twitter_api_archiverv2 import TwitterApiArchiver \ No newline at end of file diff --git a/src/archivers/telethon_archiverv2.py b/src/archivers/telethon_archiverv2.py index 819070a..90de5da 100644 --- a/src/archivers/telethon_archiverv2.py +++ b/src/archivers/telethon_archiverv2.py @@ -13,7 +13,7 @@ from media import Media class TelethonArchiver(Archiverv2): - name = "telethon" + name = "telethon_archiver" link_pattern = re.compile(r"https:\/\/t\.me(\/c){0,1}\/(.+)\/(\d+)") invite_pattern = re.compile(r"t.me(\/joinchat){0,1}\/\+?(.+)") @@ -145,8 +145,8 @@ class TelethonArchiver(Archiverv2): continue result.add_media(Media(filename)) - result.set("post", str(post)).set_title(title).set_timestamp(post.date) - return result + result.set_content(str(post)).set_title(title).set_timestamp(post.date) + return result.success("telethon") def _get_media_posts_in_group(self, chat, original_post, max_amp=10): """ diff --git a/src/archivers/twitter_api_archiverv2.py b/src/archivers/twitter_api_archiverv2.py new file mode 100644 index 0000000..c95795a --- /dev/null +++ b/src/archivers/twitter_api_archiverv2.py @@ -0,0 +1,97 @@ + +import json +from datetime import datetime +import mimetypes +import os +from loguru import logger +from pytwitter import Api +from slugify import slugify + +from metadata import Metadata +from media import Media +from .twitter_archiverv2 import TwitterArchiver +from .archiver import Archiverv2 + + +class TwitterApiArchiver(TwitterArchiver, Archiverv2): + name = "twitter_api_archiver" + + def __init__(self, config: dict) -> None: + super().__init__(config) + + if self.bearer_token: + self.api = Api(bearer_token=self.bearer_token) + elif self.consumer_key and self.consumer_secret and self.access_token and self.access_secret: + self.api = Api( + consumer_key=self.consumer_key, consumer_secret=self.consumer_secret, access_token=self.access_token, access_secret=self.access_secret) + assert hasattr(self, "api") and self.api is not None, "Missing Twitter API configurations, please provide either bearer_token OR (consumer_key, consumer_secret, access_token, access_secret) to use this archiver." + + @staticmethod + def configs() -> dict: + return { + "bearer_token": {"default": None, "help": "twitter API bearer_token which is enough for archiving, if not provided you will need consumer_key, consumer_secret, access_token, access_secret"}, + "consumer_key": {"default": None, "help": "twitter API consumer_key"}, + "consumer_secret": {"default": None, "help": "twitter API consumer_secret"}, + "access_token": {"default": None, "help": "twitter API access_token"}, + "access_secret": {"default": None, "help": "twitter API access_secret"}, + } + + def download(self, item: Metadata) -> Metadata: + url = item.get_url() + # detect URLs that we definitely cannot handle + username, tweet_id = self.get_username_tweet_id(url) + if not username: return False + + try: + tweet = self.api.get_tweet(tweet_id, expansions=["attachments.media_keys"], media_fields=["type", "duration_ms", "url", "variants"], tweet_fields=["attachments", "author_id", "created_at", "entities", "id", "text", "possibly_sensitive"]) + except Exception as e: + logger.error(f"Could not get tweet: {e}") + return False + + result = Metadata() + result.set_title(tweet.data.text) + result.set_timestamp(datetime.strptime(tweet.data.created_at, "%Y-%m-%dT%H:%M:%S.%fZ")) + + urls = [] + if tweet.includes: + for i, m in enumerate(tweet.includes.media): + media = Media(filename="") + if m.url and len(m.url): + media.set("src", m.url) + media.set("duration", (m.duration_ms or 1) // 1000) + mimetype = "image/jpeg" + elif hasattr(m, "variants"): + variant = self.choose_variant(m.variants) + if not variant: continue + media.set("src", variant.url) + mimetype = variant.content_type + else: + continue + logger.info(f"Found media {media}") + ext = mimetypes.guess_extension(mimetype) + media.filename = os.path.join(item.get_tmp_dir(), f'{slugify(url)}_{i}{ext}') + self.download_from_url(media.get("src"), media.filename) + result.add_media(media) + + result.set_content(json.dumps({ + "id": tweet.data.id, + "text": tweet.data.text, + "created_at": tweet.data.created_at, + "author_id": tweet.data.author_id, + "geo": tweet.data.geo, + "lang": tweet.data.lang, + "media": urls + }, ensure_ascii=False, indent=4)) + return result.success("twitter") + + def choose_variant(self, variants): + # choosing the highest quality possible + variant, bit_rate = None, -1 + for var in variants: + if var.content_type == "video/mp4": + if var.bit_rate > bit_rate: + bit_rate = var.bit_rate + variant = var + else: + variant = var if not variant else variant + return variant diff --git a/src/archivers/twitter_archiverv2.py b/src/archivers/twitter_archiverv2.py new file mode 100644 index 0000000..f23fa0f --- /dev/null +++ b/src/archivers/twitter_archiverv2.py @@ -0,0 +1,137 @@ +import html, re, requests +import mimetypes +import json +import os +from datetime import datetime +from loguru import logger +from metadata import Metadata +from snscrape.modules.twitter import TwitterTweetScraper, Video, Gif, Photo +from archivers import Archiverv2 +from media import Media +from slugify import slugify + + +class TwitterArchiver(Archiverv2): + """ + This Twitter Archiver uses unofficial scraping methods. + """ + + name = "twitter_archiver" + link_pattern = re.compile(r"twitter.com\/(?:\#!\/)?(\w+)\/status(?:es)?\/(\d+)") + + def __init__(self, config: dict) -> None: + super().__init__(config) + + @staticmethod + def configs() -> dict: + return {} + + def download(self, item: Metadata) -> Metadata: + """ + if this url is archivable will download post info and look for other posts from the same group with media. + can handle private/public channels + """ + url = item.get_url() + # detect URLs that we definitely cannot handle + username, tweet_id = self.get_username_tweet_id(url) + if not username: return False + + result = Metadata() + + scr = TwitterTweetScraper(tweet_id) + try: + tweet = next(scr.get_items()) + except Exception as ex: + logger.warning(f"can't get tweet: {type(ex).__name__} occurred. args: {ex.args}") + return self.download_alternative(item, url, tweet_id) + + result.set_title(tweet.content).set_content(tweet.json()).set_timestamp(tweet.date) + if tweet.media is None: + logger.debug(f'No media found, archiving tweet text only') + return result + + for i, tweet_media in enumerate(tweet.media): + media = Media(filename="") + mimetype = "" + if type(tweet_media) == Video: + variant = max( + [v for v in tweet_media.variants if v.bitrate], key=lambda v: v.bitrate) + media.set("src", variant.url).set("duration", tweet_media.duration) + mimetype = variant.contentType + elif type(tweet_media) == Gif: + variant = tweet_media.variants[0] + media.set("src", variant.url) + mimetype = variant.contentType + elif type(tweet_media) == Photo: + media.set("src", tweet_media.fullUrl.replace('name=large', 'name=orig')) + mimetype = "image/jpeg" + else: + logger.warning(f"Could not get media URL of {tweet_media}") + continue + ext = mimetypes.guess_extension(mimetype) + media.filename = os.path.join(item.get_tmp_dir(), f'{slugify(url)}_{i}{ext}') + self.download_from_url(media.get("src"), media.filename) + result.add_media(media) + + return result.success("twitter") + + def download_alternative(self, item: Metadata, url: str, tweet_id: str) -> Metadata: + """ + CURRENTLY STOPPED WORKING + """ + return False + # https://stackoverflow.com/a/71867055/6196010 + logger.debug(f"Trying twitter hack for {url=}") + result = Metadata() + + hack_url = f"https://cdn.syndication.twimg.com/tweet?id={tweet_id}" + r = requests.get(hack_url) + if r.status_code != 200: return False + tweet = r.json() + + urls = [] + for p in tweet["photos"]: + urls.append(p["url"]) + + # 1 tweet has 1 video max + if "video" in tweet: + v = tweet["video"] + urls.append(self.choose_variant(v.get("variants", []))) + + logger.debug(f"Twitter hack got {urls=}") + + for u in urls: + media = Media() + media.set("src", u) + media.filename = os.path.join(item.get_tmp_dir(), f'{slugify(url)}_{i}') + self.download_from_url(u, media.filename) + result.add_media(media) + + # .set_title(tweet["TODO"]) + result.set_content(json.dumps(tweet, ensure_ascii=False)).set_timestamp(datetime.strptime(tweet["created_at"], "%Y-%m-%dT%H:%M:%S.%fZ")) + return result + + def get_username_tweet_id(self, url): + # detect URLs that we definitely cannot handle + matches = self.link_pattern.findall(url) + if not len(matches): return False, False + + username, tweet_id = matches[0] # only one URL supported + logger.debug(f"Found {username=} and {tweet_id=} in {url=}") + + return username, tweet_id + + def choose_variant(self, variants): + # choosing the highest quality possible + variant, width, height = None, 0, 0 + for var in variants: + if var.get("type", "") == "video/mp4": + width_height = re.search(r"\/(\d+)x(\d+)\/", var["src"]) + if width_height: + w, h = int(width_height[1]), int(width_height[2]) + if w > width or h > height: + width, height = w, h + variant = var.get("src", variant) + else: + variant = var.get("src") if not variant else variant + return variant diff --git a/src/enrichers/thumbnail_enricher.py b/src/enrichers/thumbnail_enricher.py index 32e09be..94c5ee7 100644 --- a/src/enrichers/thumbnail_enricher.py +++ b/src/enrichers/thumbnail_enricher.py @@ -25,6 +25,7 @@ class ThumbnailEnricher(Enricher): folder = os.path.join(to_enrich.get_tmp_dir(), str(uuid.uuid4())) os.makedirs(folder, exist_ok=True) for i, m in enumerate(to_enrich.media[::]): + logger.info(m) if m.is_video(): logger.debug(f"generating thumbnails for {m.filename}") fps, duration = 0.5, m.get("duration") diff --git a/src/formatters/templates/html_template.html b/src/formatters/templates/html_template.html index 3d99a41..9c3b54e 100644 --- a/src/formatters/templates/html_template.html +++ b/src/formatters/templates/html_template.html @@ -1,5 +1,5 @@ {# templates/results.html #} -{% import 'media.html' as macros %} +{% import 'macros.html' as macros %} @@ -133,8 +133,8 @@

    - {% elif m.properties[prop] | length > 1 %} -
  • {{ prop }}: {{ m.properties[prop] }}
  • + {% elif m.properties[prop] | string | length > 1 %} +
  • {{ prop }}: {{ macros.copy_urlize(m.properties[prop]) }}
  • {% endif %} {% endfor %} @@ -156,7 +156,7 @@ {{ key }} - {{ metadata[key] | urlize }} + {{ macros.copy_urlize(metadata[key]) }} {% endfor %} diff --git a/src/formatters/templates/media.html b/src/formatters/templates/macros.html similarity index 89% rename from src/formatters/templates/media.html rename to src/formatters/templates/macros.html index db071cf..d57283b 100644 --- a/src/formatters/templates/media.html +++ b/src/formatters/templates/macros.html @@ -30,4 +30,10 @@ No preview available for {{ m.key }}. {% endif %} {% endfor %} +{%- endmacro -%} + +{% macro copy_urlize(val) -%} + +{{ val | string | urlize }} + {%- endmacro -%} \ No newline at end of file diff --git a/src/metadata.py b/src/metadata.py index 70984fa..ba37438 100644 --- a/src/metadata.py +++ b/src/metadata.py @@ -12,10 +12,10 @@ from media import Media @dataclass class Metadata: - status: str = "" + status: str = "no archiver" _processed_at: datetime = datetime.datetime.utcnow().replace(tzinfo=datetime.timezone.utc) metadata: Dict[str, Any] = field(default_factory=dict) - tmp_keys: Set[str] = field(default_factory=set) # keys that are not to be saved in DBs + tmp_keys: Set[str] = field(default_factory=set, repr=False) # keys that are not to be saved in DBs media: List[Media] = field(default_factory=list) final_media: Media = None # can be overwritten by formatters rearchivable: bool = False @@ -28,7 +28,7 @@ class Metadata: """ merges two Metadata instances, will overwrite according to overwrite_left flag """ - if right is None: return self + if not right: return self if overwrite_left: if right.status and len(right.status): self.status = right.status @@ -58,8 +58,18 @@ class Metadata: self.metadata[key] = default return self.metadata.get(key, default) + def success(self, context: str = None) -> Metadata: + if context: self.status = f"{context}: success" + else: self.status = "success" + return self + + def is_success(self) -> bool: + return "success" in self.status + + # custom getter/setters + def set_url(self, url: str) -> Metadata: assert type(url) is str and len(url) > 0, "invalid URL" return self.set("url", url) @@ -70,7 +80,7 @@ class Metadata: return url def set_content(self, content: str) -> Metadata: - # the main textual content/information from a social media post, webpage, ... + # a dump with all the relevant content return self.set("content", content) def set_title(self, title: str) -> Metadata: diff --git a/src/orchestrator.py b/src/orchestrator.py index be77d3d..fb28cfb 100644 --- a/src/orchestrator.py +++ b/src/orchestrator.py @@ -112,8 +112,8 @@ class ArchivingOrchestrator: logger.error(f'Got unexpected error on item {item}: {e}\n{traceback.format_exc()}') for d in self.databases: d.failed(item) - print("holding on 5min") - time.sleep(300) + print("holding on 5s") + time.sleep(5) # how does this handle the parameters like folder which can be different for each archiver? # the storage needs to know where to archive!! @@ -161,9 +161,10 @@ class ArchivingOrchestrator: # this is where the Hashes come from, the place with access to all content # the archiver does not have access to storage # a.download(result) # TODO: refactor so there's not merge here + logger.info(f"Trying archiver {a.name}") result.merge(a.download(result)) - # TODO: fix logic - if True or result.is_success(): break + # TODO: fix logic to halt when done + if result.is_success(): break # what if an archiver returns multiple entries and one is to be part of HTMLgenerator? # should it call the HTMLgenerator as if it's not an enrichment? From 1def8bb03dde9a2609f79ae91e8a94d33692256a Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Wed, 18 Jan 2023 16:16:23 +0000 Subject: [PATCH 039/190] instagram archiver --- src/archivers/__init__.py | 5 +- src/archivers/instagram_archiverv2.py | 144 ++++++++++++++++++++++++ src/archivers/telethon_archiverv2.py | 4 +- src/archivers/twitter_api_archiverv2.py | 5 + src/steps/step.py | 8 ++ 5 files changed, 162 insertions(+), 4 deletions(-) create mode 100644 src/archivers/instagram_archiverv2.py diff --git a/src/archivers/__init__.py b/src/archivers/__init__.py index f25668d..d256f8c 100644 --- a/src/archivers/__init__.py +++ b/src/archivers/__init__.py @@ -9,8 +9,9 @@ from .youtubedl_archiver import YoutubeDLArchiver # from .twitter_archiver import TwitterArchiver from .vk_archiver import VkArchiver # from .twitter_api_archiver import TwitterApiArchiver -from .instagram_archiver import InstagramArchiver +# from .instagram_archiver import InstagramArchiver from .telethon_archiverv2 import TelethonArchiver from .twitter_archiverv2 import TwitterArchiver -from .twitter_api_archiverv2 import TwitterApiArchiver \ No newline at end of file +from .twitter_api_archiverv2 import TwitterApiArchiver +from .instagram_archiverv2 import InstagramArchiver \ No newline at end of file diff --git a/src/archivers/instagram_archiverv2.py b/src/archivers/instagram_archiverv2.py new file mode 100644 index 0000000..2ca2e80 --- /dev/null +++ b/src/archivers/instagram_archiverv2.py @@ -0,0 +1,144 @@ +import re, os, shutil, html, traceback +import instaloader # https://instaloader.github.io/as-module.html +from loguru import logger + +from metadata import Metadata +from media import Media +from .archiver import Archiverv2 + + +class InstagramArchiver(Archiverv2): + """ + Uses Instaloader to download either a post (inc images, videos, text) or as much as possible from a profile (posts, stories, highlights, ...) + """ + name = "instagram_archiver" + + # NB: post regex should be tested before profile + # https://regex101.com/r/MGPquX/1 + post_pattern = re.compile(r"(?:(?:http|https):\/\/)?(?:www.)?(?:instagram.com|instagr.am|instagr.com)\/(?:p|reel)\/(\w+)") + # https://regex101.com/r/6Wbsxa/1 + profile_pattern = re.compile(r"(?:(?:http|https):\/\/)?(?:www.)?(?:instagram.com|instagr.am|instagr.com)\/(\w+)") + # TODO: links to stories + + def __init__(self, config: dict) -> None: + super().__init__(config) + # TODO: refactor how configuration validation is done + self.assert_valid_string("username") + self.assert_valid_string("password") + self.assert_valid_string("download_folder") + self.assert_valid_string("session_file") + self.insta = instaloader.Instaloader( + download_geotags=True, download_comments=True, compress_json=False, dirname_pattern=self.download_folder, filename_pattern="{date_utc}_UTC_{target}__{typename}" + ) + try: + self.insta.load_session_from_file(self.username, self.session_file) + except Exception as e: + logger.error(f"Unable to login from session file: {e}\n{traceback.format_exc()}") + try: + self.insta.login(self.username, config.instagram_self.password) + # TODO: wait for this issue to be fixed https://github.com/instaloader/instaloader/issues/1758 + self.insta.save_session_to_file(self.session_file) + except Exception as e2: + logger.error(f"Unable to finish login (retrying from file): {e2}\n{traceback.format_exc()}") + + @staticmethod + def configs() -> dict: + return { + "username": {"default": None, "help": "a valid Instagram username"}, + "password": {"default": None, "help": "the corresponding Instagram account password"}, + "download_folder": {"default": "instaloader", "help": "name of a folder to temporarily download content to"}, + "session_file": {"default": "secrets/instaloader.session", "help": "path to the instagram session which saves session credentials"}, + #TODO: fine-grain + # "download_stories": {"default": True, "help": "if the link is to a user profile: whether to get stories information"}, + } + + def download(self, item: Metadata) -> Metadata: + url = item.get_url() + + # detect URLs that we definitely cannot handle + post_matches = self.post_pattern.findall(url) + profile_matches = self.profile_pattern.findall(url) + + # return if not a valid instagram link + if not len(post_matches) and not len(profile_matches): return + + result = None + try: + os.makedirs(self.download_folder, exist_ok=True) + # process if post + if len(post_matches): + result = self.download_post(url, post_matches[0]) + # process if profile + elif len(profile_matches): + result = self.download_profile(url, profile_matches[0]) + except Exception as e: + logger.error(f"Failed to download with instagram archiver due to: {e}, make sure your account credentials are valid.") + finally: + shutil.rmtree(self.download_folder, ignore_errors=True) + return result + + def download_post(self, url: str, post_id: str) -> Metadata: + logger.debug(f"Instagram {post_id=} detected in {url=}") + + post = instaloader.Post.from_shortcode(self.insta.context, post_id) + if self.insta.download_post(post, target=post.owner_username): + return self.process_downloads(url, post.title, post._asdict(), post.date) + + def download_profile(self, url: str, username: str) -> Metadata: + # gets posts, posts where username is tagged, igtv postss, stories, and highlights + logger.debug(f"Instagram {username=} detected in {url=}") + + profile = instaloader.Profile.from_username(self.insta.context, username) + try: + for post in profile.get_posts(): + try: self.insta.download_post(post, target=f"profile_post_{post.owner_username}") + except Exception as e: logger.error(f"Failed to download post: {post.shortcode}: {e}") + except Exception as e: logger.error(f"Failed profile.get_posts: {e}") + + try: + for post in profile.get_tagged_posts(): + try: self.insta.download_post(post, target=f"tagged_post_{post.owner_username}") + except Exception as e: logger.error(f"Failed to download tagged post: {post.shortcode}: {e}") + except Exception as e: logger.error(f"Failed profile.get_tagged_posts: {e}") + + try: + for post in profile.get_igtv_posts(): + try: self.insta.download_post(post, target=f"igtv_post_{post.owner_username}") + except Exception as e: logger.error(f"Failed to download igtv post: {post.shortcode}: {e}") + except Exception as e: logger.error(f"Failed profile.get_igtv_posts: {e}") + + try: + for story in self.insta.get_stories([profile.userid]): + for item in story.get_items(): + try: self.insta.download_storyitem(item, target=f"story_item_{story.owner_username}") + except Exception as e: logger.error(f"Failed to download story item: {item}: {e}") + except Exception as e: logger.error(f"Failed get_stories: {e}") + + try: + for highlight in self.insta.get_highlights(profile.userid): + for item in highlight.get_items(): + try: self.insta.download_storyitem(item, target=f"highlight_item_{highlight.owner_username}") + except Exception as e: logger.error(f"Failed to download highlight item: {item}: {e}") + except Exception as e: logger.error(f"Failed get_highlights: {e}") + + return self.process_downloads(url, f"@{username}", profile._asdict(), None) + + def process_downloads(self, url, title, content, date): + result = Metadata() + result.set_title(title).set_content(str(content)).set_timestamp(date) + + try: + all_media = [] + for f in os.listdir(self.download_folder): + if os.path.isfile((filename := os.path.join(self.download_folder, f))): + if filename[-4:] == ".txt": continue + all_media.append(Media(filename)) + + assert len(all_media) > 1, "No uploaded media found" + all_media.sort(key=lambda m: m.filename, reverse=True) + for m in all_media: + result.add_media(m) + + return result.success("instagram") + except Exception as e: + logger.error(f"Could not fetch instagram post {url} due to: {e}") diff --git a/src/archivers/telethon_archiverv2.py b/src/archivers/telethon_archiverv2.py index 90de5da..094b004 100644 --- a/src/archivers/telethon_archiverv2.py +++ b/src/archivers/telethon_archiverv2.py @@ -19,8 +19,8 @@ class TelethonArchiver(Archiverv2): def __init__(self, config: dict) -> None: super().__init__(config) - assert self.api_id is not None and type(self.api_id) == str and len(self.api_id) > 0, f"invalid telethon api_id value ({self.api_id}) should be a valid string" - assert self.api_hash is not None and type(self.api_hash) == str and len(self.api_hash) > 0, f"invalid telethon api_hash value ({self.api_hash}) should be a valid string" + self.assert_valid_string("api_id") + self.assert_valid_string("api_hash") self.client = TelegramClient(self.session_file, self.api_id, self.api_hash) diff --git a/src/archivers/twitter_api_archiverv2.py b/src/archivers/twitter_api_archiverv2.py index c95795a..5cfbc0d 100644 --- a/src/archivers/twitter_api_archiverv2.py +++ b/src/archivers/twitter_api_archiverv2.py @@ -20,8 +20,13 @@ class TwitterApiArchiver(TwitterArchiver, Archiverv2): super().__init__(config) if self.bearer_token: + self.assert_valid_string("bearer_token") self.api = Api(bearer_token=self.bearer_token) elif self.consumer_key and self.consumer_secret and self.access_token and self.access_secret: + self.assert_valid_string("consumer_key") + self.assert_valid_string("consumer_secret") + self.assert_valid_string("access_token") + self.assert_valid_string("access_secret") self.api = Api( consumer_key=self.consumer_key, consumer_secret=self.consumer_secret, access_token=self.access_token, access_secret=self.access_secret) assert hasattr(self, "api") and self.api is not None, "Missing Twitter API configurations, please provide either bearer_token OR (consumer_key, consumer_secret, access_token, access_secret) to use this archiver." diff --git a/src/steps/step.py b/src/steps/step.py index a8bad38..e80437b 100644 --- a/src/steps/step.py +++ b/src/steps/step.py @@ -29,3 +29,11 @@ class Step(ABC): if sub.name == name: return sub(config) raise ClassFoundException(f"Unable to initialize STEP with {name=}, check your configuration file/step names.") + + def assert_valid_string(self, prop: str) -> None: + """ + receives a property name an ensures it exists and is a valid non-empty string, raises an exception if not + """ + assert hasattr(self, prop), f"property {prop} not found" + s = getattr(self, prop) + assert s is not None and type(s) == str and len(s) > 0, f"invalid property {prop} value '{s}', it should be a valid string" From 63d1abbe4b42166973849c1626bba162ee4b41b1 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Wed, 18 Jan 2023 16:56:35 +0000 Subject: [PATCH 040/190] tiktok archiver though info is no longer working --- src/archivers/__init__.py | 5 ++- src/archivers/tiktok_archiverv2.py | 59 ++++++++++++++++++++++++++++++ 2 files changed, 62 insertions(+), 2 deletions(-) create mode 100644 src/archivers/tiktok_archiverv2.py diff --git a/src/archivers/__init__.py b/src/archivers/__init__.py index d256f8c..ce9b701 100644 --- a/src/archivers/__init__.py +++ b/src/archivers/__init__.py @@ -3,7 +3,7 @@ from .base_archiver import Archiver, ArchiveResult from .archiver import Archiverv2 from .telegram_archiver import TelegramArchiver # from .telethon_archiver import TelethonArchiver -from .tiktok_archiver import TiktokArchiver +# from .tiktok_archiver import TiktokArchiver from .wayback_archiver import WaybackArchiver from .youtubedl_archiver import YoutubeDLArchiver # from .twitter_archiver import TwitterArchiver @@ -14,4 +14,5 @@ from .vk_archiver import VkArchiver from .telethon_archiverv2 import TelethonArchiver from .twitter_archiverv2 import TwitterArchiver from .twitter_api_archiverv2 import TwitterApiArchiver -from .instagram_archiverv2 import InstagramArchiver \ No newline at end of file +from .instagram_archiverv2 import InstagramArchiver +from .tiktok_archiverv2 import TiktokArchiver \ No newline at end of file diff --git a/src/archivers/tiktok_archiverv2.py b/src/archivers/tiktok_archiverv2.py new file mode 100644 index 0000000..85d3083 --- /dev/null +++ b/src/archivers/tiktok_archiverv2.py @@ -0,0 +1,59 @@ +import json +import os, traceback +import re +import uuid +import tiktok_downloader +from loguru import logger + +from metadata import Metadata +from media import Media +from .archiver import Archiverv2 + + +class TiktokArchiver(Archiverv2): + name = "tiktok_archiver" + + def __init__(self, config: dict) -> None: + super().__init__(config) + + @staticmethod + def configs() -> dict: + return {} + + def download(self, item: Metadata) -> Metadata: + url = item.get_url() + if 'tiktok.com' not in url: + return False + + result = Metadata() + try: + info = tiktok_downloader.info_post(url) + result.set_title(info.desc) + result.set_timestamp(info.create_time) + result.set_content(json.dumps({ + "cover": info.cover, + "author": info.author, + "music_title": info.author, + "caption": getattr(info, "caption", info.desc), + }, ensure_ascii=False, indent=4)) + except: + error = traceback.format_exc() + logger.warning(f'Other Tiktok error {error}') + + + try: + filename = os.path.join(item.get_tmp_dir(), f'{str(uuid.uuid4())[0:8]}.mp4') + tiktok_media = tiktok_downloader.snaptik(url).get_media() + + if len(tiktok_media) <= 0: + logger.debug(f"TikTok: could not get media from {url=}") + return False + + logger.info(f'downloading video {filename=}') + tiktok_media[0].download(filename) + + result.add_media(Media(filename)) + return result.success("tiktok") + except: + error = traceback.format_exc() + logger.warning(f'Other Tiktok error {error}') From 085376f63f60b13130b626cca0fe9e8444bb3e3e Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Wed, 18 Jan 2023 21:14:20 +0000 Subject: [PATCH 041/190] telegram archiver --- src/archivers/__init__.py | 5 +- src/archivers/archiver.py | 15 ++++- src/archivers/telegram_archiverv2.py | 75 +++++++++++++++++++++++++ src/archivers/telethon_archiverv2.py | 7 +-- src/archivers/twitter_api_archiverv2.py | 3 +- src/archivers/twitter_archiverv2.py | 6 +- src/metadata.py | 10 +++- 7 files changed, 105 insertions(+), 16 deletions(-) create mode 100644 src/archivers/telegram_archiverv2.py diff --git a/src/archivers/__init__.py b/src/archivers/__init__.py index ce9b701..8fb0265 100644 --- a/src/archivers/__init__.py +++ b/src/archivers/__init__.py @@ -1,7 +1,7 @@ # we need to explicitly expose the available imports here from .base_archiver import Archiver, ArchiveResult from .archiver import Archiverv2 -from .telegram_archiver import TelegramArchiver +# from .telegram_archiver import TelegramArchiver # from .telethon_archiver import TelethonArchiver # from .tiktok_archiver import TiktokArchiver from .wayback_archiver import WaybackArchiver @@ -15,4 +15,5 @@ from .telethon_archiverv2 import TelethonArchiver from .twitter_archiverv2 import TwitterArchiver from .twitter_api_archiverv2 import TwitterApiArchiver from .instagram_archiverv2 import InstagramArchiver -from .tiktok_archiverv2 import TiktokArchiver \ No newline at end of file +from .tiktok_archiverv2 import TiktokArchiver +from .telegram_archiverv2 import TelegramArchiver \ No newline at end of file diff --git a/src/archivers/archiver.py b/src/archivers/archiver.py index f16464a..369dd60 100644 --- a/src/archivers/archiver.py +++ b/src/archivers/archiver.py @@ -1,6 +1,7 @@ from __future__ import annotations from abc import abstractmethod from dataclasses import dataclass +import os from metadata import Metadata from steps.step import Step import mimetypes, requests @@ -23,7 +24,7 @@ class Archiverv2(Step): # used when archivers need to login or do other one-time setup pass - def clean_url(self, url:str) -> str: + def clean_url(self, url: str) -> str: # used to clean unnecessary URL parameters return url @@ -37,13 +38,23 @@ class Archiverv2(Step): return mime.split("/")[0] return "" - def download_from_url(self, url:str, to_filename:str) -> None: + def download_from_url(self, url: str, to_filename: str = None, item: Metadata = None) -> str: + """ + downloads a URL to provided filename, or inferred from URL, returns local filename, if item is present will use its tmp_dir + """ + if not to_filename: + to_filename = url.split('/')[-1].split('?')[0] + if len(to_filename) > 64: + to_filename = to_filename[-64:] + if item: + to_filename = os.path.join(item.get_tmp_dir(), to_filename) headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36' } d = requests.get(url, headers=headers) with open(to_filename, 'wb') as f: f.write(d.content) + return to_filename @abstractmethod def download(self, item: Metadata) -> Metadata: pass diff --git a/src/archivers/telegram_archiverv2.py b/src/archivers/telegram_archiverv2.py new file mode 100644 index 0000000..674fa26 --- /dev/null +++ b/src/archivers/telegram_archiverv2.py @@ -0,0 +1,75 @@ +import requests, re + +import html +from bs4 import BeautifulSoup +from loguru import logger + +from metadata import Metadata +from media import Media +from .archiver import Archiverv2 + + +class TelegramArchiver(Archiverv2): + """ + Archiver for telegram that does not require login, but the telethon_archiver is much more advised, will only return if at least one image or one video is found + """ + name = "telegram_archiver" + + def __init__(self, config: dict) -> None: + super().__init__(config) + + @staticmethod + def configs() -> dict: + return {} + + def download(self, item: Metadata) -> Metadata: + url = item.get_url() + # detect URLs that we definitely cannot handle + if 't.me' != item.netloc: + return False + + headers = { + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36' + } + + # TODO: check if we can do this more resilient to variable URLs + if url[-8:] != "?embed=1": + url += "?embed=1" + + t = requests.get(url, headers=headers) + s = BeautifulSoup(t.content, 'html.parser') + + result = Metadata() + result.set_content(html.escape(str(t.content))) + if (timestamp := (s.find_all('time') or [{}])[0].get('datetime')): + result.set_timestamp(timestamp) + + video = s.find("video") + if video is None: + logger.warning("could not find video") + image_tags = s.find_all(class_="js-message_photo") + + image_urls = [] + for im in image_tags: + urls = [u.replace("'", "") for u in re.findall(r'url\((.*?)\)', im['style'])] + image_urls += urls + + if not len(image_urls): return False + for img_url in image_urls: + result.add_media(Media(self.download_from_url(img_url))) + else: + video_url = video.get('src') + m_video = Media(self.download_from_url(video_url)) + # extract duration from HTML + try: + duration = s.find_all('time')[0].contents[0] + if ':' in duration: + duration = float(duration.split( + ':')[0]) * 60 + float(duration.split(':')[1]) + else: + duration = float(duration) + m_video.set("duration", duration) + except: pass + result.add_media(m_video) + + return result.success("telegram") diff --git a/src/archivers/telethon_archiverv2.py b/src/archivers/telethon_archiverv2.py index 094b004..3f698e1 100644 --- a/src/archivers/telethon_archiverv2.py +++ b/src/archivers/telethon_archiverv2.py @@ -117,7 +117,7 @@ class TelethonArchiver(Archiverv2): if post is None: return False logger.info(f"fetched telegram {post.id=}") - + media_posts = self._get_media_posts_in_group(chat, post) logger.debug(f'got {len(media_posts)=} for {url=}') @@ -126,7 +126,7 @@ class TelethonArchiver(Archiverv2): group_id = post.grouped_id if post.grouped_id is not None else post.id title = post.message for mp in media_posts: - if len(mp.message) > len(title): title = mp.message # save the longest text found (usually only 1) + if len(mp.message) > len(title): title = mp.message # save the longest text found (usually only 1) # media can also be in entities if mp.entities: @@ -134,8 +134,7 @@ class TelethonArchiver(Archiverv2): if len(other_media_urls): logger.debug(f"Got {len(other_media_urls)} other media urls from {mp.id=}: {other_media_urls}") for i, om_url in enumerate(other_media_urls): - filename = os.path.join(tmp_dir, f'{chat}_{group_id}_{i}') - self.download_from_url(om_url, filename) + filename = self.download_from_url(om_url, f'{chat}_{group_id}_{i}', item) result.add_media(Media(filename=filename), id=f"{group_id}_{i}") filename_dest = os.path.join(tmp_dir, f'{chat}_{group_id}', str(mp.id)) diff --git a/src/archivers/twitter_api_archiverv2.py b/src/archivers/twitter_api_archiverv2.py index 5cfbc0d..1f43935 100644 --- a/src/archivers/twitter_api_archiverv2.py +++ b/src/archivers/twitter_api_archiverv2.py @@ -74,8 +74,7 @@ class TwitterApiArchiver(TwitterArchiver, Archiverv2): continue logger.info(f"Found media {media}") ext = mimetypes.guess_extension(mimetype) - media.filename = os.path.join(item.get_tmp_dir(), f'{slugify(url)}_{i}{ext}') - self.download_from_url(media.get("src"), media.filename) + media.filename = self.download_from_url(media.get("src"), f'{slugify(url)}_{i}{ext}', item) result.add_media(media) result.set_content(json.dumps({ diff --git a/src/archivers/twitter_archiverv2.py b/src/archivers/twitter_archiverv2.py index f23fa0f..d537fe4 100644 --- a/src/archivers/twitter_archiverv2.py +++ b/src/archivers/twitter_archiverv2.py @@ -69,8 +69,7 @@ class TwitterArchiver(Archiverv2): logger.warning(f"Could not get media URL of {tweet_media}") continue ext = mimetypes.guess_extension(mimetype) - media.filename = os.path.join(item.get_tmp_dir(), f'{slugify(url)}_{i}{ext}') - self.download_from_url(media.get("src"), media.filename) + media.filename = self.download_from_url(media.get("src"), f'{slugify(url)}_{i}{ext}', item) result.add_media(media) return result.success("twitter") @@ -103,8 +102,7 @@ class TwitterArchiver(Archiverv2): for u in urls: media = Media() media.set("src", u) - media.filename = os.path.join(item.get_tmp_dir(), f'{slugify(url)}_{i}') - self.download_from_url(u, media.filename) + media.filename = self.download_from_url(u, f'{slugify(url)}_{i}', item) result.add_media(media) # .set_title(tweet["TODO"]) diff --git a/src/metadata.py b/src/metadata.py index ba37438..2293eb3 100644 --- a/src/metadata.py +++ b/src/metadata.py @@ -4,9 +4,9 @@ from ast import List, Set from typing import Any, Union, Dict from dataclasses import dataclass, field import datetime, mimetypes +from urllib.parse import urlparse from loguru import logger -# import json - +from dateutil.parser import parse as parse_dt from media import Media @@ -66,6 +66,10 @@ class Metadata: def is_success(self) -> bool: return "success" in self.status + @property # getter .netloc + def netloc(self) -> str: + return urlparse(self.get_url()).netloc + # custom getter/setters @@ -96,6 +100,8 @@ class Metadata: return self.get("tmp_dir") def set_timestamp(self, timestamp: datetime.datetime) -> Metadata: + if type(timestamp) == str: + timestamp = parse_dt(timestamp) assert type(timestamp) == datetime.datetime, "set_timestamp expects a datetime instance" return self.set("timestamp", timestamp) From eb0859fbaf66782f18419a016cedcca905fbbccd Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Wed, 18 Jan 2023 21:34:40 +0000 Subject: [PATCH 042/190] vk archiver --- src/archivers/__init__.py | 5 ++- src/archivers/vk_archiverv2.py | 67 ++++++++++++++++++++++++++++++++++ src/utils/misc.py | 11 ++++++ 3 files changed, 81 insertions(+), 2 deletions(-) create mode 100644 src/archivers/vk_archiverv2.py diff --git a/src/archivers/__init__.py b/src/archivers/__init__.py index 8fb0265..d2a2c49 100644 --- a/src/archivers/__init__.py +++ b/src/archivers/__init__.py @@ -7,7 +7,7 @@ from .archiver import Archiverv2 from .wayback_archiver import WaybackArchiver from .youtubedl_archiver import YoutubeDLArchiver # from .twitter_archiver import TwitterArchiver -from .vk_archiver import VkArchiver +# from .vk_archiver import VkArchiver # from .twitter_api_archiver import TwitterApiArchiver # from .instagram_archiver import InstagramArchiver @@ -16,4 +16,5 @@ from .twitter_archiverv2 import TwitterArchiver from .twitter_api_archiverv2 import TwitterApiArchiver from .instagram_archiverv2 import InstagramArchiver from .tiktok_archiverv2 import TiktokArchiver -from .telegram_archiverv2 import TelegramArchiver \ No newline at end of file +from .telegram_archiverv2 import TelegramArchiver +from .vk_archiverv2 import VkArchiver \ No newline at end of file diff --git a/src/archivers/vk_archiverv2.py b/src/archivers/vk_archiverv2.py new file mode 100644 index 0000000..147424d --- /dev/null +++ b/src/archivers/vk_archiverv2.py @@ -0,0 +1,67 @@ +import re, json, mimetypes, os + +from loguru import logger +from vk_url_scraper import VkScraper, DateTimeEncoder + +from metadata import Metadata +from media import Media +from utils.misc import dump_payload +from .archiver import Archiverv2 + + +class VkArchiver(Archiverv2): + """" + VK videos are handled by YTDownloader, this archiver gets posts text and images. + Currently only works for /wall posts + """ + name = "vk_archiver" + wall_pattern = re.compile(r"(wall.{0,1}\d+_\d+)") + photo_pattern = re.compile(r"(photo.{0,1}\d+_\d+)") + + def __init__(self, config: dict) -> None: + super().__init__(config) + self.assert_valid_string("username") + self.assert_valid_string("password") + self.vks = VkScraper(self.username, self.password, session_file=self.session_file) + + @staticmethod + def configs() -> dict: + return { + "username": {"default": None, "help": "valid VKontakte username"}, + "password": {"default": None, "help": "valid VKontakte password"}, + "session_file": {"default": "secrets/vk_config.v2.json", "help": "valid VKontakte password"}, + } + + def download(self, item: Metadata) -> Metadata: + url = item.get_url() + + if "vk.com" not in item.netloc: return False + + # some urls can contain multiple wall/photo/... parts and all will be fetched + vk_scrapes = self.vks.scrape(url) + if not len(vk_scrapes): return False + + result = Metadata() + for scrape in vk_scrapes: + if not result.get_title(): + result.set_title(scrape["text"]) + if not result.get_timestamp(): + result.set_timestamp(scrape["datetime"]) + + result.set_content(dump_payload(vk_scrapes)) + + textual_output = "" + title, datetime = vk_scrapes[0]["text"], vk_scrapes[0]["datetime"] + urls_found = [] + for scrape in vk_scrapes: + textual_output += f"id: {scrape['id']}
    time utc: {scrape['datetime']}
    text: {scrape['text']}
    payload: {dump_payload(scrape['payload'])}


    " + title = scrape["text"] if len(title) == 0 else title + datetime = scrape["datetime"] if not datetime else datetime + for attachments in scrape["attachments"].values(): + urls_found.extend(attachments) + + filenames = self.vks.download_media(vk_scrapes, item.get_tmp_dir()) + for filename in filenames: + result.add_media(Media(filename)) + + return result.success("vk") diff --git a/src/utils/misc.py b/src/utils/misc.py index 644c713..e7c5427 100644 --- a/src/utils/misc.py +++ b/src/utils/misc.py @@ -29,3 +29,14 @@ def getattr_or(o: object, prop: str, default=None): except: return default + +class DateTimeEncoder(json.JSONEncoder): + # to allow json.dump with datetimes do json.dumps(obj, cls=DateTimeEncoder) + def default(self, o): + if isinstance(o, datetime): + return str(o) # with timezone + return json.JSONEncoder.default(self, o) + + +def dump_payload(p): + return json.dumps(p, ensure_ascii=False, indent=4, cls=DateTimeEncoder) From 176ce7e8da2d4c853053bdfa6d38290af6999317 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Wed, 18 Jan 2023 21:37:29 +0000 Subject: [PATCH 043/190] vk cleanup --- src/archivers/vk_archiverv2.py | 17 ++--------------- 1 file changed, 2 insertions(+), 15 deletions(-) diff --git a/src/archivers/vk_archiverv2.py b/src/archivers/vk_archiverv2.py index 147424d..ec06808 100644 --- a/src/archivers/vk_archiverv2.py +++ b/src/archivers/vk_archiverv2.py @@ -1,7 +1,5 @@ -import re, json, mimetypes, os - from loguru import logger -from vk_url_scraper import VkScraper, DateTimeEncoder +from vk_url_scraper import VkScraper from metadata import Metadata from media import Media @@ -15,8 +13,6 @@ class VkArchiver(Archiverv2): Currently only works for /wall posts """ name = "vk_archiver" - wall_pattern = re.compile(r"(wall.{0,1}\d+_\d+)") - photo_pattern = re.compile(r"(photo.{0,1}\d+_\d+)") def __init__(self, config: dict) -> None: super().__init__(config) @@ -40,6 +36,7 @@ class VkArchiver(Archiverv2): # some urls can contain multiple wall/photo/... parts and all will be fetched vk_scrapes = self.vks.scrape(url) if not len(vk_scrapes): return False + logger.debug(f"VK: got {len(vk_scrapes)} scraped instances") result = Metadata() for scrape in vk_scrapes: @@ -50,16 +47,6 @@ class VkArchiver(Archiverv2): result.set_content(dump_payload(vk_scrapes)) - textual_output = "" - title, datetime = vk_scrapes[0]["text"], vk_scrapes[0]["datetime"] - urls_found = [] - for scrape in vk_scrapes: - textual_output += f"id: {scrape['id']}
    time utc: {scrape['datetime']}
    text: {scrape['text']}
    payload: {dump_payload(scrape['payload'])}


    " - title = scrape["text"] if len(title) == 0 else title - datetime = scrape["datetime"] if not datetime else datetime - for attachments in scrape["attachments"].values(): - urls_found.extend(attachments) - filenames = self.vks.download_media(vk_scrapes, item.get_tmp_dir()) for filename in filenames: result.add_media(Media(filename)) From 9bbc13e9be7c0a40d886e87443416b532cee616d Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Wed, 18 Jan 2023 23:15:25 +0000 Subject: [PATCH 044/190] vk and yt-dlp --- src/archivers/__init__.py | 5 +- src/archivers/vk_archiverv2.py | 2 +- src/archivers/youtubedl_archiverv2.py | 70 +++++++++++++++++++++++++++ src/enrichers/thumbnail_enricher.py | 1 - 4 files changed, 74 insertions(+), 4 deletions(-) create mode 100644 src/archivers/youtubedl_archiverv2.py diff --git a/src/archivers/__init__.py b/src/archivers/__init__.py index d2a2c49..51d3546 100644 --- a/src/archivers/__init__.py +++ b/src/archivers/__init__.py @@ -5,7 +5,7 @@ from .archiver import Archiverv2 # from .telethon_archiver import TelethonArchiver # from .tiktok_archiver import TiktokArchiver from .wayback_archiver import WaybackArchiver -from .youtubedl_archiver import YoutubeDLArchiver +# from .youtubedl_archiver import YoutubeDLArchiver # from .twitter_archiver import TwitterArchiver # from .vk_archiver import VkArchiver # from .twitter_api_archiver import TwitterApiArchiver @@ -17,4 +17,5 @@ from .twitter_api_archiverv2 import TwitterApiArchiver from .instagram_archiverv2 import InstagramArchiver from .tiktok_archiverv2 import TiktokArchiver from .telegram_archiverv2 import TelegramArchiver -from .vk_archiverv2 import VkArchiver \ No newline at end of file +from .vk_archiverv2 import VkArchiver +from .youtubedl_archiverv2 import YoutubeDLArchiver \ No newline at end of file diff --git a/src/archivers/vk_archiverv2.py b/src/archivers/vk_archiverv2.py index ec06808..32b6cec 100644 --- a/src/archivers/vk_archiverv2.py +++ b/src/archivers/vk_archiverv2.py @@ -1,9 +1,9 @@ from loguru import logger from vk_url_scraper import VkScraper +from utils.misc import dump_payload from metadata import Metadata from media import Media -from utils.misc import dump_payload from .archiver import Archiverv2 diff --git a/src/archivers/youtubedl_archiverv2.py b/src/archivers/youtubedl_archiverv2.py new file mode 100644 index 0000000..6d26de6 --- /dev/null +++ b/src/archivers/youtubedl_archiverv2.py @@ -0,0 +1,70 @@ +import datetime +import os + +import yt_dlp +from loguru import logger + +from metadata import Metadata +from media import Media +from .archiver import Archiverv2 + + +class YoutubeDLArchiver(Archiverv2): + name = "youtubedl_enricher" + + def __init__(self, config: dict) -> None: + super().__init__(config) + + @staticmethod + def configs() -> dict: + return { + "facebook_cookie": {"default": None, "help": "optional facebook cookie to have more access to content, from browser, looks like 'cookie: datr= xxxx'"}, + } + + def download(self, item: Metadata) -> Metadata: + url = item.get_url() + + if item.netloc in ['facebook.com', 'www.facebook.com'] and self.facebook_cookie: + logger.debug('Using Facebook cookie') + yt_dlp.utils.std_headers['cookie'] = self.facebook_cookie + + ydl = yt_dlp.YoutubeDL({'outtmpl': os.path.join(item.get_tmp_dir(), f'%(id)s.%(ext)s'), 'quiet': False}) + + try: + # don'd download since it can be a live stream + info = ydl.extract_info(url, download=False) + if info.get('is_live', False): + logger.warning("Live streaming media, not archiving now") + return False + except yt_dlp.utils.DownloadError as e: + logger.debug(f'No video - Youtube normal control flow: {e}') + return False + except Exception as e: + logger.debug(f'ytdlp exception which is normal for example a facebook page with images only will cause a IndexError: list index out of range. Exception here is: \n {e}') + return False + + # this time download + info = ydl.extract_info(url, download=True) + if "entries" in info: + entries = info.get("entries", []) + if not len(entries): + logger.warning('YoutubeDLArchiver could not find any video') + return False + else: entries = [info] + + result = Metadata() + result.set_title(info.get("title")) + for entry in entries: + filename = ydl.prepare_filename(entry) + if not os.path.exists(filename): + filename = filename.split('.')[0] + '.mkv' + result.add_media(Media(filename).set("duration", info.get("duration"))) + + if (timestamp := info.get("timestamp")): + timestamp = datetime.datetime.utcfromtimestamp(timestamp).replace(tzinfo=datetime.timezone.utc).isoformat() + result.set_timestamp(timestamp) + if (upload_date := info.get("upload_date")): + upload_date = datetime.datetime.strptime(upload_date, '%Y%m%d').replace(tzinfo=datetime.timezone.utc) + result.set("upload_date", upload_date) + + return result.success("yt-dlp") diff --git a/src/enrichers/thumbnail_enricher.py b/src/enrichers/thumbnail_enricher.py index 94c5ee7..32e09be 100644 --- a/src/enrichers/thumbnail_enricher.py +++ b/src/enrichers/thumbnail_enricher.py @@ -25,7 +25,6 @@ class ThumbnailEnricher(Enricher): folder = os.path.join(to_enrich.get_tmp_dir(), str(uuid.uuid4())) os.makedirs(folder, exist_ok=True) for i, m in enumerate(to_enrich.media[::]): - logger.info(m) if m.is_video(): logger.debug(f"generating thumbnails for {m.filename}") fps, duration = 0.5, m.get("duration") From ea2c266fa272f4f911dd8a08084ff097c28875f0 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Thu, 19 Jan 2023 00:27:11 +0000 Subject: [PATCH 045/190] clean up and wacz WIP --- src/archivers/__init__.py | 4 +- src/archivers/archiver.py | 19 ++- src/archivers/instagram_archiver.py | 140 -------------------- src/archivers/telegram_archiver.py | 89 ------------- src/archivers/telethon_archiver.py | 125 ----------------- src/archivers/tiktok_archiver.py | 72 ---------- src/archivers/twitter_api_archiver.py | 75 ----------- src/archivers/twitter_archiver.py | 105 --------------- src/archivers/vk_archiver.py | 74 ----------- src/archivers/wayback_archiver.py | 89 ------------- src/archivers/youtubedl_archiver.py | 118 ----------------- src/configs/v2config.py | 9 +- src/enrichers/__init__.py | 5 +- src/enrichers/wacz_enricher.py | 70 ++++++++++ src/enrichers/wayback_enricher.py | 36 +++-- src/formatters/templates/html_template.html | 1 - src/formatters/templates/macros.html | 15 ++- src/media.py | 2 +- src/storages/s3.py | 3 +- 19 files changed, 141 insertions(+), 910 deletions(-) delete mode 100644 src/archivers/instagram_archiver.py delete mode 100644 src/archivers/telegram_archiver.py delete mode 100644 src/archivers/telethon_archiver.py delete mode 100644 src/archivers/tiktok_archiver.py delete mode 100644 src/archivers/twitter_api_archiver.py delete mode 100644 src/archivers/twitter_archiver.py delete mode 100644 src/archivers/vk_archiver.py delete mode 100644 src/archivers/wayback_archiver.py delete mode 100644 src/archivers/youtubedl_archiver.py create mode 100644 src/enrichers/wacz_enricher.py diff --git a/src/archivers/__init__.py b/src/archivers/__init__.py index 51d3546..22e142f 100644 --- a/src/archivers/__init__.py +++ b/src/archivers/__init__.py @@ -1,16 +1,16 @@ # we need to explicitly expose the available imports here from .base_archiver import Archiver, ArchiveResult -from .archiver import Archiverv2 # from .telegram_archiver import TelegramArchiver # from .telethon_archiver import TelethonArchiver # from .tiktok_archiver import TiktokArchiver -from .wayback_archiver import WaybackArchiver +# from .wayback_archiver import WaybackArchiver # from .youtubedl_archiver import YoutubeDLArchiver # from .twitter_archiver import TwitterArchiver # from .vk_archiver import VkArchiver # from .twitter_api_archiver import TwitterApiArchiver # from .instagram_archiver import InstagramArchiver +from .archiver import Archiverv2 from .telethon_archiverv2 import TelethonArchiver from .twitter_archiverv2 import TwitterArchiver from .twitter_api_archiverv2 import TwitterApiArchiver diff --git a/src/archivers/archiver.py b/src/archivers/archiver.py index 369dd60..7682e11 100644 --- a/src/archivers/archiver.py +++ b/src/archivers/archiver.py @@ -15,9 +15,8 @@ class Archiverv2(Step): # without this STEP.__init__ is not called super().__init__(config) - # only for typing... - def init(name: str, config: dict) -> Archiverv2: + # only for typing... return Step.init(name, config, Archiverv2) def setup(self) -> None: @@ -58,3 +57,19 @@ class Archiverv2(Step): @abstractmethod def download(self, item: Metadata) -> Metadata: pass + + # TODO: how to fix allow predictable key + # def get_key(self, filename): + # """ + # returns a key in the format "[archiverName]_[filename]" includes extension + # """ + # tail = os.path.split(filename)[1] # returns filename.ext from full path + # _id, extension = os.path.splitext(tail) # returns [filename, .ext] + # if 'unknown_video' in _id: + # _id = _id.replace('unknown_video', 'jpg') + + # # long filenames can cause problems, so trim them if necessary + # if len(_id) > 128: + # _id = _id[-128:] + + # return f'{self.name}_{_id}{extension}' \ No newline at end of file diff --git a/src/archivers/instagram_archiver.py b/src/archivers/instagram_archiver.py deleted file mode 100644 index 62db876..0000000 --- a/src/archivers/instagram_archiver.py +++ /dev/null @@ -1,140 +0,0 @@ -import re, os, shutil, html, traceback -import instaloader # https://instaloader.github.io/as-module.html -from loguru import logger - -from .base_archiver import Archiver, ArchiveResult -from configs import Config -from storages import Storage - - -class InstagramArchiver(Archiver): - """ - Uses Instaloader to download either a post (inc images, videos, text) or as much as possible from a profile (posts, stories, highlights, ) - """ - name = "instagram" - DOWNLOAD_FOLDER = "instaloader" - # NB: post should be tested before profile - # https://regex101.com/r/MGPquX/1 - post_pattern = re.compile(r"(?:(?:http|https):\/\/)?(?:www.)?(?:instagram.com|instagr.am|instagr.com)\/(?:p|reel)\/(\w+)") - # https://regex101.com/r/6Wbsxa/1 - profile_pattern = re.compile(r"(?:(?:http|https):\/\/)?(?:www.)?(?:instagram.com|instagr.am|instagr.com)\/(\w+)") - - def __init__(self, storage: Storage, config: Config): - super().__init__(storage, config) - self.insta = instaloader.Instaloader(download_geotags=True, download_comments=True, compress_json=False, dirname_pattern=self.DOWNLOAD_FOLDER, filename_pattern="{date_utc}_UTC_{target}__{typename}") - if config.instagram_config: - try: - self.insta.load_session_from_file(config.instagram_config.username, config.instagram_config.session_file) - except Exception as e: - logger.error(f"Unable to login from session file: {e}\n{traceback.format_exc()}") - try: - self.insta.login(config.instagram_config.username, config.instagram_config. - password) - #TODO: wait for this issue to be fixed https://github.com/instaloader/instaloader/issues/1758 - self.insta.save_session_to_file(config.instagram_config.session_file) - except Exception as e2: - logger.error(f"Unable to finish login (retrying from file): {e2}\n{traceback.format_exc()}") - - - - def download(self, url, check_if_exists=False): - post_matches = self.post_pattern.findall(url) - profile_matches = self.profile_pattern.findall(url) - - # return if not a valid instagram link - if not len(post_matches) and not len(profile_matches): - return - - # check if already uploaded - key = self.get_html_key(url) - if check_if_exists and self.storage.exists(key): - # only s3 storage supports storage.exists as not implemented on gd - cdn_url = self.storage.get_cdn_url(key) - screenshot = self.get_screenshot(url) - wacz = self.get_wacz(url) - return self.generateArchiveResult(status='already archived', cdn_url=cdn_url, screenshot=screenshot, wacz=wacz) - - try: - # process if post - if len(post_matches): - return self.download_post(url, post_matches[0]) - - # process if profile - if len(profile_matches): - return self.download_profile(url, profile_matches[0]) - finally: - shutil.rmtree(self.DOWNLOAD_FOLDER, ignore_errors=True) - - def download_post(self, url, post_id): - logger.debug(f"Instagram {post_id=} detected in {url=}") - - post = instaloader.Post.from_shortcode(self.insta.context, post_id) - if self.insta.download_post(post, target=post.owner_username): - return self.upload_downloaded_content(url, post.title, post._asdict(), post.date) - - def download_profile(self, url, username): - # gets posts, posts where username is tagged, igtv postss, stories, and highlights - logger.debug(f"Instagram {username=} detected in {url=}") - - profile = instaloader.Profile.from_username(self.insta.context, username) - try: - for post in profile.get_posts(): - try: self.insta.download_post(post, target=f"profile_post_{post.owner_username}") - except Exception as e: logger.error(f"Failed to download post: {post.shortcode}: {e}") - except Exception as e: logger.error(f"Failed profile.get_posts: {e}") - - try: - for post in profile.get_tagged_posts(): - try: self.insta.download_post(post, target=f"tagged_post_{post.owner_username}") - except Exception as e: logger.error(f"Failed to download tagged post: {post.shortcode}: {e}") - except Exception as e: logger.error(f"Failed profile.get_tagged_posts: {e}") - - try: - for post in profile.get_igtv_posts(): - try: self.insta.download_post(post, target=f"igtv_post_{post.owner_username}") - except Exception as e: logger.error(f"Failed to download igtv post: {post.shortcode}: {e}") - except Exception as e: logger.error(f"Failed profile.get_igtv_posts: {e}") - - try: - for story in self.insta.get_stories([profile.userid]): - for item in story.get_items(): - try: self.insta.download_storyitem(item, target=f"story_item_{story.owner_username}") - except Exception as e: logger.error(f"Failed to download story item: {item}: {e}") - except Exception as e: logger.error(f"Failed get_stories: {e}") - - try: - for highlight in self.insta.get_highlights(profile.userid): - for item in highlight.get_items(): - try: self.insta.download_storyitem(item, target=f"highlight_item_{highlight.owner_username}") - except Exception as e: logger.error(f"Failed to download highlight item: {item}: {e}") - except Exception as e: logger.error(f"Failed get_highlights: {e}") - - return self.upload_downloaded_content(url, f"@{username}", profile._asdict(), None) - - def upload_downloaded_content(self, url, title, content, date): - status = "success" - try: - uploaded_media = [] - for f in os.listdir(self.DOWNLOAD_FOLDER): - if os.path.isfile((filename := os.path.join(self.DOWNLOAD_FOLDER, f))): - key = self.get_key(filename) - self.storage.upload(filename, key) - hash = self.get_hash(filename) - cdn_url = self.storage.get_cdn_url(key) - uploaded_media.append({'cdn_url': cdn_url, 'key': key, 'hash': hash}) - assert len(uploaded_media) > 1, "No uploaded media found" - - uploaded_media.sort(key=lambda m:m["key"], reverse=True) - - page_cdn, page_hash, _ = self.generate_media_page_html(url, uploaded_media, html.escape(str(content))) - except Exception as e: - logger.error(f"Could not fetch instagram post {url} due to: {e}") - status = "error" - finally: - shutil.rmtree(self.DOWNLOAD_FOLDER, ignore_errors=True) - - if status == "success": - screenshot = self.get_screenshot(url) - wacz = self.get_wacz(url) - - return self.generateArchiveResult(status=status, cdn_url=page_cdn, title=title, timestamp=date, hash=page_hash, screenshot=screenshot, wacz=wacz) diff --git a/src/archivers/telegram_archiver.py b/src/archivers/telegram_archiver.py deleted file mode 100644 index c6d8747..0000000 --- a/src/archivers/telegram_archiver.py +++ /dev/null @@ -1,89 +0,0 @@ -import os, requests, re - -import html -from bs4 import BeautifulSoup -from loguru import logger - -from .base_archiver import Archiver, ArchiveResult -from storages import Storage - - -class TelegramArchiver(Archiver): - name = "telegram" - - def download(self, url, check_if_exists=False): - # detect URLs that we definitely cannot handle - if 't.me' != self.get_netloc(url): - return False - - headers = { - 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36' - } - status = "success" - - original_url = url - - # TODO: check if we can do this more resilient to variable URLs - if url[-8:] != "?embed=1": - url += "?embed=1" - - screenshot = self.get_screenshot(url) - wacz = self.get_wacz(url) - - t = requests.get(url, headers=headers) - s = BeautifulSoup(t.content, 'html.parser') - video = s.find("video") - - if video is None: - logger.warning("could not find video") - image_tags = s.find_all(class_="js-message_photo") - - images = [] - for im in image_tags: - urls = [u.replace("'", "") for u in re.findall(r'url\((.*?)\)', im['style'])] - images += urls - - page_cdn, page_hash, thumbnail = self.generate_media_page(images, url, html.escape(str(t.content))) - time_elements = s.find_all('time') - timestamp = time_elements[0].get('datetime') if len(time_elements) else None - - return self.generateArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=timestamp, wacz=wacz) - - video_url = video.get('src') - video_id = video_url.split('/')[-1].split('?')[0] - key = self.get_key(video_id) - - filename = os.path.join(Storage.TMP_FOLDER, key) - - if check_if_exists and self.storage.exists(key): - status = 'already archived' - - v = requests.get(video_url, headers=headers) - - with open(filename, 'wb') as f: - f.write(v.content) - - if status != 'already archived': - self.storage.upload(filename, key) - - hash = self.get_hash(filename) - - # extract duration from HTML - try: - duration = s.find_all('time')[0].contents[0] - if ':' in duration: - duration = float(duration.split( - ':')[0]) * 60 + float(duration.split(':')[1]) - else: - duration = float(duration) - except: - duration = "" - - # process thumbnails - key_thumb, thumb_index = self.get_thumbnails( - filename, key, duration=duration) - os.remove(filename) - - cdn_url = self.storage.get_cdn_url(key) - return self.generateArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb, thumbnail_index=thumb_index, - duration=duration, title=original_url, timestamp=s.find_all('time')[1].get('datetime'), hash=hash, screenshot=screenshot, wacz=wacz) diff --git a/src/archivers/telethon_archiver.py b/src/archivers/telethon_archiver.py deleted file mode 100644 index a2cbf0a..0000000 --- a/src/archivers/telethon_archiver.py +++ /dev/null @@ -1,125 +0,0 @@ -import os, re, html -from loguru import logger -from telethon.sync import TelegramClient -from telethon.errors import ChannelInvalidError - -from storages import Storage -from .base_archiver import Archiver, ArchiveResult -from configs import Config -from utils import getattr_or - - -class TelethonArchiver(Archiver): - name = "telethon" - link_pattern = re.compile(r"https:\/\/t\.me(\/c){0,1}\/(.+)\/(\d+)") - - def __init__(self, storage: Storage, config: Config): - super().__init__(storage, config) - if config.telegram_config: - c = config.telegram_config - self.client = TelegramClient("./anon.session", c.api_id, c.api_hash) - self.bot_token = c.bot_token - - def _get_media_posts_in_group(self, chat, original_post, max_amp=10): - """ - Searches for Telegram posts that are part of the same group of uploads - The search is conducted around the id of the original post with an amplitude - of `max_amp` both ways - Returns a list of [post] where each post has media and is in the same grouped_id - """ - if getattr_or(original_post, "grouped_id") is None: - return [original_post] if getattr_or(original_post, "media") else [] - - search_ids = [i for i in range(original_post.id - max_amp, original_post.id + max_amp + 1)] - posts = self.client.get_messages(chat, ids=search_ids) - media = [] - for post in posts: - if post is not None and post.grouped_id == original_post.grouped_id and post.media is not None: - media.append(post) - return media - - def download(self, url, check_if_exists=False): - if not hasattr(self, "client"): - logger.warning('Missing Telethon config') - return False - - # detect URLs that we definitely cannot handle - matches = self.link_pattern.findall(url) - if not len(matches): - return False - - status = "success" - - # app will ask (stall for user input!) for phone number and auth code if anon.session not found - with self.client.start(bot_token=self.bot_token): - matches = list(matches[0]) - chat, post_id = matches[1], matches[2] - - post_id = int(post_id) - - try: - post = self.client.get_messages(chat, ids=post_id) - except ValueError as e: - logger.error(f"Could not fetch telegram {url} possibly it's private: {e}") - return False - except ChannelInvalidError as e: - logger.error(f"Could not fetch telegram {url}. This error can be fixed if you setup a bot_token in addition to api_id and api_hash: {e}") - return False - - if post is None: return False - - media_posts = self._get_media_posts_in_group(chat, post) - logger.debug(f'got {len(media_posts)=} for {url=}') - - screenshot = self.get_screenshot(url) - wacz = self.get_wacz(url) - - if len(media_posts) > 0: - key = self.get_html_key(url) - - if check_if_exists and self.storage.exists(key): - # only s3 storage supports storage.exists as not implemented on gd - cdn_url = self.storage.get_cdn_url(key) - return self.generateArchiveResult(status='already archived', cdn_url=cdn_url, title=post.message, timestamp=post.date, screenshot=screenshot, wacz=wacz) - - key_thumb, thumb_index = None, None - group_id = post.grouped_id if post.grouped_id is not None else post.id - uploaded_media = [] - message = post.message - for mp in media_posts: - if len(mp.message) > len(message): message = mp.message - - # media can also be in entities - if mp.entities: - other_media_urls = [e.url for e in mp.entities if hasattr(e, "url") and e.url and self._guess_file_type(e.url) in ["video", "image"]] - logger.debug(f"Got {len(other_media_urls)} other medial urls from {mp.id=}: {other_media_urls}") - for om_url in other_media_urls: - filename = os.path.join(Storage.TMP_FOLDER, f'{chat}_{group_id}_{self._get_key_from_url(om_url)}') - self.download_from_url(om_url, filename) - key = filename.split(Storage.TMP_FOLDER)[1] - self.storage.upload(filename, key) - hash = self.get_hash(filename) - cdn_url = self.storage.get_cdn_url(key) - uploaded_media.append({'cdn_url': cdn_url, 'key': key, 'hash': hash}) - - filename_dest = os.path.join(Storage.TMP_FOLDER, f'{chat}_{group_id}', str(mp.id)) - filename = self.client.download_media(mp.media, filename_dest) - if not filename: - logger.debug(f"Empty media found, skipping {str(mp)=}") - continue - - key = filename.split(Storage.TMP_FOLDER)[1] - self.storage.upload(filename, key) - hash = self.get_hash(filename) - cdn_url = self.storage.get_cdn_url(key) - uploaded_media.append({'cdn_url': cdn_url, 'key': key, 'hash': hash}) - if key_thumb is None: - key_thumb, thumb_index = self.get_thumbnails(filename, key) - os.remove(filename) - - page_cdn, page_hash, _ = self.generate_media_page_html(url, uploaded_media, html.escape(str(post))) - - return self.generateArchiveResult(status=status, cdn_url=page_cdn, title=message, timestamp=post.date, hash=page_hash, screenshot=screenshot, thumbnail=key_thumb, thumbnail_index=thumb_index, wacz=wacz) - - page_cdn, page_hash, _ = self.generate_media_page_html(url, [], html.escape(str(post))) - return self.generateArchiveResult(status=status, cdn_url=page_cdn, title=post.message, timestamp=getattr_or(post, "date"), hash=page_hash, screenshot=screenshot, wacz=wacz) diff --git a/src/archivers/tiktok_archiver.py b/src/archivers/tiktok_archiver.py deleted file mode 100644 index 55cb97e..0000000 --- a/src/archivers/tiktok_archiver.py +++ /dev/null @@ -1,72 +0,0 @@ -import os, traceback -import tiktok_downloader -from loguru import logger - -from .base_archiver import Archiver, ArchiveResult -from storages import Storage - - -class TiktokArchiver(Archiver): - name = "tiktok" - - def download(self, url, check_if_exists=False): - if 'tiktok.com' not in url: - return False - - status = 'success' - - try: - info = tiktok_downloader.info_post(url) - key = self.get_key(f'{info.id}.mp4') - filename = os.path.join(Storage.TMP_FOLDER, key) - logger.info(f'found video {key=}') - - if check_if_exists and self.storage.exists(key): - status = 'already archived' - - media = tiktok_downloader.snaptik(url).get_media() - - if len(media) <= 0: - if status == 'already archived': - return self.generateArchiveResult(status='Could not download media, but already archived', cdn_url=self.storage.get_cdn_url(key)) - else: - return self.generateArchiveResult(status='Could not download media') - - logger.info(f'downloading video {key=}') - media[0].download(filename) - - if status != 'already archived': - logger.info(f'uploading video {key=}') - self.storage.upload(filename, key) - - try: - key_thumb, thumb_index = self.get_thumbnails(filename, key, duration=info.duration) - except Exception as e: - logger.error(e) - key_thumb = '' - thumb_index = 'error creating thumbnails' - - hash = self.get_hash(filename) - screenshot = self.get_screenshot(url) - wacz = self.get_wacz(url) - - try: os.remove(filename) - except FileNotFoundError: - logger.info(f'tmp file not found thus not deleted {filename}') - cdn_url = self.storage.get_cdn_url(key) - timestamp = info.create.isoformat() if hasattr(info, "create") else None - - return self.generateArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb, - thumbnail_index=thumb_index, duration=getattr(info, "duration", 0), title=getattr(info, "caption", ""), - timestamp=timestamp, hash=hash, screenshot=screenshot, wacz=wacz) - - except tiktok_downloader.Except.InvalidUrl as e: - status = 'Invalid URL' - logger.warning(f'Invalid URL on {url} {e}\n{traceback.format_exc()}') - return self.generateArchiveResult(status=status) - - except: - error = traceback.format_exc() - status = 'Other Tiktok error: ' + str(error) - logger.warning(f'Other Tiktok error' + str(error)) - return self.generateArchiveResult(status=status) diff --git a/src/archivers/twitter_api_archiver.py b/src/archivers/twitter_api_archiver.py deleted file mode 100644 index da56d31..0000000 --- a/src/archivers/twitter_api_archiver.py +++ /dev/null @@ -1,75 +0,0 @@ - -import json -from datetime import datetime -from loguru import logger -from pytwitter import Api - -from storages.base_storage import Storage -from configs import Config -from .base_archiver import ArchiveResult -from .twitter_archiver import TwitterArchiver - - -class TwitterApiArchiver(TwitterArchiver): - name = "twitter_api" - - def __init__(self, storage: Storage, config: Config): - super().__init__(storage, config) - c = config.twitter_config - - if c.bearer_token: - self.api = Api(bearer_token=c.bearer_token) - elif c.consumer_key and c.consumer_secret and c.access_token and c.access_secret: - self.api = Api( - consumer_key=c.consumer_key, consumer_secret=c.consumer_secret, access_token=c.access_token, access_secret=c.access_secret) - - def download(self, url, check_if_exists=False): - if not hasattr(self, "api"): - logger.warning('Missing Twitter API config') - return False - - username, tweet_id = self.get_username_tweet_id(url) - if not username: return False - - tweet = self.api.get_tweet(tweet_id, expansions=["attachments.media_keys"], media_fields=["type", "duration_ms", "url", "variants"], tweet_fields=["attachments", "author_id", "created_at", "entities", "id", "text", "possibly_sensitive"]) - timestamp = datetime.strptime(tweet.data.created_at, "%Y-%m-%dT%H:%M:%S.%fZ") - - # check if exists - key = self.get_html_key(url) - if check_if_exists and self.storage.exists(key): - # only s3 storage supports storage.exists as not implemented on gd - cdn_url = self.storage.get_cdn_url(key) - screenshot = self.get_screenshot(url) - return self.generateArchiveResult(status='already archived', cdn_url=cdn_url, title=tweet.data.text, timestamp=timestamp, screenshot=screenshot) - - urls = [] - if tweet.includes: - for m in tweet.includes.media: - if m.url: - urls.append(m.url) - elif hasattr(m, "variants"): - var_url = self.choose_variant(m.variants) - urls.append(var_url) - else: - urls.append(None) # will trigger error - - for u in urls: - if u is None: - logger.debug(f"Should not have gotten None url for {tweet.includes.media=} so going to download_alternative in twitter_archiver") - return self.download_alternative(url, tweet_id) - logger.debug(f"found {urls=}") - - output = json.dumps({ - "id": tweet.data.id, - "text": tweet.data.text, - "created_at": tweet.data.created_at, - "author_id": tweet.data.author_id, - "geo": tweet.data.geo, - "lang": tweet.data.lang, - "media": urls - }, ensure_ascii=False, indent=4) - - screenshot = self.get_screenshot(url) - wacz = self.get_wacz(url) - page_cdn, page_hash, thumbnail = self.generate_media_page(urls, url, output) - return self.generateArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=timestamp, title=tweet.data.text, wacz=wacz) diff --git a/src/archivers/twitter_archiver.py b/src/archivers/twitter_archiver.py deleted file mode 100644 index f1f22c0..0000000 --- a/src/archivers/twitter_archiver.py +++ /dev/null @@ -1,105 +0,0 @@ -import html, re, requests -from datetime import datetime -from loguru import logger -from snscrape.modules.twitter import TwitterTweetScraper, Video, Gif, Photo - -from .base_archiver import Archiver, ArchiveResult - -class TwitterArchiver(Archiver): - """ - This Twitter Archiver uses unofficial scraping methods, and it works as - an alternative to TwitterApiArchiver when no API credentials are provided. - """ - - name = "twitter" - link_pattern = re.compile(r"twitter.com\/(?:\#!\/)?(\w+)\/status(?:es)?\/(\d+)") - - def get_username_tweet_id(self, url): - # detect URLs that we definitely cannot handle - matches = self.link_pattern.findall(url) - if not len(matches): return False, False - - username, tweet_id = matches[0] # only one URL supported - logger.debug(f"Found {username=} and {tweet_id=} in {url=}") - - return username, tweet_id - - def download(self, url, check_if_exists=False): - username, tweet_id = self.get_username_tweet_id(url) - if not username: return False - - scr = TwitterTweetScraper(tweet_id) - - try: - tweet = next(scr.get_items()) - except Exception as ex: - logger.warning(f"can't get tweet: {type(ex).__name__} occurred. args: {ex.args}") - return self.download_alternative(url, tweet_id) - - if tweet.media is None: - logger.debug(f'No media found, archiving tweet text only') - screenshot = self.get_screenshot(url) - wacz = self.get_wacz(url) - page_cdn, page_hash, _ = self.generate_media_page_html(url, [], html.escape(tweet.json())) - return self.generateArchiveResult(status="success", cdn_url=page_cdn, title=tweet.content, timestamp=tweet.date, hash=page_hash, screenshot=screenshot, wacz=wacz) - - urls = [] - - for media in tweet.media: - if type(media) == Video: - variant = max( - [v for v in media.variants if v.bitrate], key=lambda v: v.bitrate) - urls.append(variant.url) - elif type(media) == Gif: - urls.append(media.variants[0].url) - elif type(media) == Photo: - urls.append(media.fullUrl.replace('name=large', 'name=orig')) - else: - logger.warning(f"Could not get media URL of {media}") - - page_cdn, page_hash, thumbnail = self.generate_media_page(urls, url, tweet.json()) - - screenshot = self.get_screenshot(url) - wacz = self.get_wacz(url) - - return self.generateArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=tweet.date, title=tweet.content, wacz=wacz) - - def download_alternative(self, url, tweet_id): - # https://stackoverflow.com/a/71867055/6196010 - logger.debug(f"Trying twitter hack for {url=}") - hack_url = f"https://cdn.syndication.twimg.com/tweet?id={tweet_id}" - r = requests.get(hack_url) - if r.status_code != 200: return False - tweet = r.json() - - urls = [] - for p in tweet["photos"]: - urls.append(p["url"]) - - # 1 tweet has 1 video max - if "video" in tweet: - v = tweet["video"] - urls.append(self.choose_variant(v.get("variants", []))) - - logger.debug(f"Twitter hack got {urls=}") - - timestamp = datetime.strptime(tweet["created_at"], "%Y-%m-%dT%H:%M:%S.%fZ") - screenshot = self.get_screenshot(url) - wacz = self.get_wacz(url) - page_cdn, page_hash, thumbnail = self.generate_media_page(urls, url, r.text) - return self.generateArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=timestamp, title=tweet["text"], wacz=wacz) - - def choose_variant(self, variants): - # choosing the highest quality possible - variant, width, height = None, 0, 0 - for var in variants: - if var["type"] == "video/mp4": - width_height = re.search(r"\/(\d+)x(\d+)\/", var["src"]) - if width_height: - w, h = int(width_height[1]), int(width_height[2]) - if w > width or h > height: - width, height = w, h - variant = var.get("src", variant) - else: - variant = var.get("src") if not variant else variant - return variant diff --git a/src/archivers/vk_archiver.py b/src/archivers/vk_archiver.py deleted file mode 100644 index 1d38fa9..0000000 --- a/src/archivers/vk_archiver.py +++ /dev/null @@ -1,74 +0,0 @@ -import re, json, mimetypes, os - -from loguru import logger -from vk_url_scraper import VkScraper, DateTimeEncoder - -from storages import Storage -from .base_archiver import Archiver, ArchiveResult -from configs import Config - - -class VkArchiver(Archiver): - """" - VK videos are handled by YTDownloader, this archiver gets posts text and images. - Currently only works for /wall posts - """ - name = "vk" - wall_pattern = re.compile(r"(wall.{0,1}\d+_\d+)") - photo_pattern = re.compile(r"(photo.{0,1}\d+_\d+)") - - def __init__(self, storage: Storage, config: Config): - super().__init__(storage, config) - if config.vk_config != None: - self.vks = VkScraper(config.vk_config.username, config.vk_config.password) - - def download(self, url, check_if_exists=False): - if not hasattr(self, "vks") or self.vks is None: - logger.debug("VK archiver was not supplied with credentials.") - return False - - key = self.get_html_key(url) - # if check_if_exists and self.storage.exists(key): - # screenshot = self.get_screenshot(url) - # cdn_url = self.storage.get_cdn_url(key) - # return self.generateArchiveResult(status="already archived", cdn_url=cdn_url, screenshot=screenshot) - - results = self.vks.scrape(url) # some urls can contain multiple wall/photo/... parts and all will be fetched - if len(results) == 0: - return False - - def dump_payload(p): return json.dumps(p, ensure_ascii=False, indent=4, cls=DateTimeEncoder) - textual_output = "" - title, datetime = results[0]["text"], results[0]["datetime"] - urls_found = [] - for res in results: - textual_output += f"id: {res['id']}
    time utc: {res['datetime']}
    text: {res['text']}
    payload: {dump_payload(res['payload'])}


    " - title = res["text"] if len(title) == 0 else title - datetime = res["datetime"] if not datetime else datetime - for attachments in res["attachments"].values(): - urls_found.extend(attachments) - - # we don't call generate_media_page which downloads urls because it cannot download vk video urls - thumbnail, thumbnail_index = None, None - uploaded_media = [] - filenames = self.vks.download_media(results, Storage.TMP_FOLDER) - for filename in filenames: - key = self.get_key(filename) - self.storage.upload(filename, key) - hash = self.get_hash(filename) - cdn_url = self.storage.get_cdn_url(key) - try: - _type = mimetypes.guess_type(filename)[0].split("/")[0] - if _type == "image" and thumbnail is None: - thumbnail = cdn_url - if _type == "video" and (thumbnail is None or thumbnail_index is None): - thumbnail, thumbnail_index = self.get_thumbnails(filename, key) - except Exception as e: - logger.warning(f"failed to get thumb for {filename=} with {e=}") - uploaded_media.append({'cdn_url': cdn_url, 'key': key, 'hash': hash}) - - page_cdn, page_hash, thumbnail = self.generate_media_page_html(url, uploaded_media, textual_output, thumbnail=thumbnail) - # # if multiple wall/photos/videos are present the screenshot will only grab the 1st - screenshot = self.get_screenshot(url) - wacz = self.get_wacz(url) - return self.generateArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, thumbnail_index=thumbnail_index, timestamp=datetime, title=title, wacz=wacz) diff --git a/src/archivers/wayback_archiver.py b/src/archivers/wayback_archiver.py deleted file mode 100644 index 1bfa78a..0000000 --- a/src/archivers/wayback_archiver.py +++ /dev/null @@ -1,89 +0,0 @@ -import time, requests - -from loguru import logger -from bs4 import BeautifulSoup - -from storages import Storage -from .base_archiver import Archiver, ArchiveResult -from configs import Config - - -class WaybackArchiver(Archiver): - """ - This archiver could implement a check_if_exists by going to "https://web.archive.org/web/{url}" - but that might not be desirable since the webpage might have been archived a long time ago and thus have changed - """ - name = "wayback" - - def __init__(self, storage: Storage, config: Config): - super(WaybackArchiver, self).__init__(storage, config) - self.config = config.wayback_config - self.seen_urls = {} - - def download(self, url, check_if_exists=False): - if self.config is None: - logger.error('Missing Wayback config') - return False - if check_if_exists: - if url in self.seen_urls: return self.seen_urls[url] - - screenshot = self.get_screenshot(url) - wacz = self.get_wacz(url) - - logger.debug(f"POSTing {url=} to web.archive.org") - ia_headers = { - "Accept": "application/json", - "Authorization": f"LOW {self.config.key}:{self.config.secret}" - } - r = requests.post('https://web.archive.org/save/', headers=ia_headers, data={'url': url}) - - if r.status_code != 200: - logger.warning(f"Internet archive failed with status of {r.status_code}") - return self.generateArchiveResult(status="Internet archive failed", screenshot=screenshot, wacz=wacz) - - if 'job_id' not in r.json() and 'message' in r.json(): - return self.custom_retry(r.json(), screenshot=screenshot, wacz=wacz) - - job_id = r.json()['job_id'] - logger.debug(f"GETting status for {job_id=} on {url=}") - status_r = requests.get(f'https://web.archive.org/save/status/{job_id}', headers=ia_headers) - retries = 0 - - # TODO: make the job queue parallel -> consider propagation of results back to sheet though - # wait 90-120 seconds for the archive job to finish - while (status_r.status_code != 200 or status_r.json()['status'] == 'pending') and retries < 30: - time.sleep(3) - try: - logger.debug(f"GETting status for {job_id=} on {url=} [{retries=}]") - status_r = requests.get(f'https://web.archive.org/save/status/{job_id}', headers=ia_headers) - except: - time.sleep(1) - retries += 1 - - if status_r.status_code != 200: - return self.generateArchiveResult(status=f"Internet archive failed: check https://web.archive.org/save/status/{job_id}", screenshot=screenshot, wacz=wacz) - - status_json = status_r.json() - if status_json['status'] != 'success': - return self.custom_retry(status_json, screenshot=screenshot, wacz=wacz) - - archive_url = f"https://web.archive.org/web/{status_json['timestamp']}/{status_json['original_url']}" - - try: - req = requests.get(archive_url) - parsed = BeautifulSoup(req.content, 'html.parser') - title = parsed.find_all('title')[0].text - if title == 'Wayback Machine': - title = 'Could not get title' - except: - title = "Could not get title" - self.seen_urls[url] = self.generateArchiveResult(status='success', cdn_url=archive_url, title=title, screenshot=screenshot, wacz=wacz) - return self.seen_urls[url] - - def custom_retry(self, json_data, **kwargs): - logger.warning(f"Internet archive failed json \n {json_data}") - if "please try again" in str(json_data).lower(): - return self.signal_retry_in(**kwargs) - if "this host has been already captured" in str(json_data).lower(): - return self.signal_retry_in(**kwargs, min_seconds=86400, max_seconds=129600) # 24h to 36h later - return self.generateArchiveResult(status=f"Internet archive failed: {json_data}", **kwargs) diff --git a/src/archivers/youtubedl_archiver.py b/src/archivers/youtubedl_archiver.py deleted file mode 100644 index e2f27a2..0000000 --- a/src/archivers/youtubedl_archiver.py +++ /dev/null @@ -1,118 +0,0 @@ - -import os, datetime - -import yt_dlp -from loguru import logger - -from .base_archiver import Archiver, ArchiveResult -from storages import Storage -from configs import Config - - -class YoutubeDLArchiver(Archiver): - name = "youtube_dl" - ydl_opts = {'outtmpl': f'{Storage.TMP_FOLDER}%(id)s.%(ext)s', 'quiet': False} - - def __init__(self, storage: Storage, config: Config): - super().__init__(storage, config) - self.fb_cookie = config.facebook_cookie - - def download(self, url, check_if_exists=False): - netloc = self.get_netloc(url) - if netloc in ['facebook.com', 'www.facebook.com'] and self.fb_cookie: - logger.debug('Using Facebook cookie') - yt_dlp.utils.std_headers['cookie'] = self.fb_cookie - - ydl = yt_dlp.YoutubeDL(YoutubeDLArchiver.ydl_opts) - cdn_url = None - status = 'success' - - try: - info = ydl.extract_info(url, download=False) - except yt_dlp.utils.DownloadError as e: - logger.debug(f'No video - Youtube normal control flow: {e}') - return False - except Exception as e: - logger.debug(f'ytdlp exception which is normal for example a facebook page with images only will cause a IndexError: list index out of range. Exception here is: \n {e}') - return False - - if info.get('is_live', False): - logger.warning("Live streaming media, not archiving now") - return self.generateArchiveResult(status="Streaming media") - - if 'twitter.com' in netloc: - if 'https://twitter.com/' in info['webpage_url']: - logger.info('Found https://twitter.com/ in the download url from Twitter') - else: - logger.info('Found a linked video probably in a link in a tweet - not getting that video as there may be images in the tweet') - return False - - if check_if_exists: - if 'entries' in info: - if len(info['entries']) > 1: - logger.warning('YoutubeDLArchiver succeeded but cannot archive channels or pages with multiple videos') - return False - elif len(info['entries']) == 0: - logger.warning( - 'YoutubeDLArchiver succeeded but did not find video') - return False - - filename = ydl.prepare_filename(info['entries'][0]) - else: - filename = ydl.prepare_filename(info) - - key = self.get_key(filename) - - if self.storage.exists(key): - status = 'already archived' - cdn_url = self.storage.get_cdn_url(key) - - # sometimes this results in a different filename, so do this again - info = ydl.extract_info(url, download=True) - - # TODO: add support for multiple videos - if 'entries' in info: - if len(info['entries']) > 1: - logger.warning( - 'YoutubeDLArchiver cannot archive channels or pages with multiple videos') - return False - else: - info = info['entries'][0] - - filename = ydl.prepare_filename(info) - - if not os.path.exists(filename): - filename = filename.split('.')[0] + '.mkv' - - if status != 'already archived': - key = self.get_key(filename) - self.storage.upload(filename, key) - - # filename ='tmp/sDE-qZdi8p8.webm' - # key ='SM0022/youtube_dl_sDE-qZdi8p8.webm' - cdn_url = self.storage.get_cdn_url(key) - - hash = self.get_hash(filename) - screenshot = self.get_screenshot(url) - wacz = self.get_wacz(url) - - # get duration - duration = info.get('duration') - - # get thumbnails - try: - key_thumb, thumb_index = self.get_thumbnails(filename, key, duration=duration) - except: - key_thumb = '' - thumb_index = 'Could not generate thumbnails' - - os.remove(filename) - - timestamp = None - if 'timestamp' in info and info['timestamp'] is not None: - timestamp = datetime.datetime.utcfromtimestamp(info['timestamp']).replace(tzinfo=datetime.timezone.utc).isoformat() - elif 'upload_date' in info and info['upload_date'] is not None: - timestamp = datetime.datetime.strptime(info['upload_date'], '%Y%m%d').replace(tzinfo=datetime.timezone.utc) - - return self.generateArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb, thumbnail_index=thumb_index, duration=duration, - title=info['title'] if 'title' in info else None, timestamp=timestamp, hash=hash, screenshot=screenshot, wacz=wacz) diff --git a/src/configs/v2config.py b/src/configs/v2config.py index dec3565..7b0820d 100644 --- a/src/configs/v2config.py +++ b/src/configs/v2config.py @@ -57,7 +57,12 @@ class ConfigV2: assert "." not in child.name, f"class prop name cannot contain dots('.'): {child.name}" assert "." not in config, f"config property cannot contain dots('.'): {config}" config_path = f"{child.name}.{config}" - parser.add_argument(f'--{config_path}', action='store', dest=config_path, help=f"{details['help']} (defaults to {details['default']})", choices=details.get("choices", None)) + try: + parser.add_argument(f'--{config_path}', action='store', dest=config_path, help=f"{details['help']} (defaults to {details['default']})", choices=details.get("choices", None)) + except argparse.ArgumentError: + # captures cases when a Step is used in 2 flows, eg: wayback enricher vs wayback archiver + pass + self.defaults[config_path] = details["default"] if "cli_set" in details: self.cli_ops[config_path] = details["cli_set"] @@ -92,7 +97,7 @@ class ConfigV2: self.feeder = Feeder.init(steps.get("feeder", "cli_feeder"), self.config) self.formatter = Formatter.init(steps.get("formatter", "html_formatter"), self.config) self.enrichers = [Enricher.init(e, self.config) for e in steps.get("enrichers", [])] - self.archivers = [Archiverv2.init(e, self.config) for e in steps.get("archivers", [])] + self.archivers = [Archiverv2.init(e, self.config) for e in (steps.get("archivers") or [])] self.databases = [Database.init(e, self.config) for e in steps.get("databases", [])] self.storages = [StorageV2.init(e, self.config) for e in steps.get("storages", [])] diff --git a/src/enrichers/__init__.py b/src/enrichers/__init__.py index 8b9220b..fe9cc68 100644 --- a/src/enrichers/__init__.py +++ b/src/enrichers/__init__.py @@ -1,5 +1,6 @@ from .enricher import Enricher from .screenshot_enricher import ScreenshotEnricher -from .wayback_enricher import WaybackEnricher +from .wayback_enricher import WaybackArchiverEnricher from .hash_enricher import HashEnricher -from .thumbnail_enricher import ThumbnailEnricher \ No newline at end of file +from .thumbnail_enricher import ThumbnailEnricher +from .wacz_enricher import WaczEnricher \ No newline at end of file diff --git a/src/enrichers/wacz_enricher.py b/src/enrichers/wacz_enricher.py new file mode 100644 index 0000000..1fa3191 --- /dev/null +++ b/src/enrichers/wacz_enricher.py @@ -0,0 +1,70 @@ +import os +import shutil +import subprocess +import uuid +from archivers.archiver import Archiverv2 +from media import Media +from . import Enricher +from metadata import Metadata +from loguru import logger +import time, requests + + +class WaczEnricher(Enricher): + """ + Submits the current URL to the webarchive and returns a job_id or completed archive + """ + name = "wacz_enricher" + + def __init__(self, config: dict) -> None: + # without this STEP.__init__ is not called + super().__init__(config) + + @staticmethod + def configs() -> dict: + return { + "profile": {"default": None, "help": "browsertrix-profile (for profile generation see https://github.com/webrecorder/browsertrix-crawler#creating-and-using-browser-profiles)."}, + "timeout": {"default": 90, "help": "timeout for WACZ generation in seconds"}, + } + + def enrich(self, to_enrich: Metadata) -> bool: + # TODO: figure out support for browsertrix in docker + url = to_enrich.get_url() + logger.debug(f"generating WACZ for {url=}") + collection = str(uuid.uuid4())[0:8] + browsertrix_home = os.path.abspath(to_enrich.get_tmp_dir()) + cmd = [ + "docker", "run", + "--rm", # delete container once it has completed running + "-v", f"{browsertrix_home}:/crawls/", + # "-it", # this leads to "the input device is not a TTY" + "webrecorder/browsertrix-crawler", "crawl", + "--url", url, + "--scopeType", "page", + "--generateWACZ", + "--text", + "--collection", collection, + "--behaviors", "autoscroll,autoplay,autofetch,siteSpecific", + "--behaviorTimeout", str(self.timeout), + "--timeout", str(self.timeout) + ] + if self.profile: + profile_fn = os.path.join(browsertrix_home, "profile.tar.gz") + shutil.copyfile(self.profile, profile_fn) + # TODO: test which is right + cmd.extend(["--profile", profile_fn]) + # cmd.extend(["--profile", "/crawls/profile.tar.gz"]) + + try: + logger.info(f"Running browsertrix-crawler: {' '.join(cmd)}") + subprocess.run(cmd, check=True) + except Exception as e: + logger.error(f"WACZ generation failed: {e}") + return False + + filename = os.path.join(browsertrix_home, "collections", collection, f"{collection}.wacz") + if not os.path.exists(filename): + logger.warning(f"Unable to locate and upload WACZ {filename=}") + return False + + to_enrich.add_media(Media(filename), "browsertrix") diff --git a/src/enrichers/wayback_enricher.py b/src/enrichers/wayback_enricher.py index 429f218..db53a08 100644 --- a/src/enrichers/wayback_enricher.py +++ b/src/enrichers/wayback_enricher.py @@ -1,16 +1,15 @@ -from utils import Webdriver +from archivers.archiver import Archiverv2 from . import Enricher from metadata import Metadata from loguru import logger -from selenium.common.exceptions import TimeoutException import time, requests -class WaybackEnricher(Enricher): +class WaybackArchiverEnricher(Enricher, Archiverv2): """ Submits the current URL to the webarchive and returns a job_id or completed archive """ - name = "wayback_enricher" + name = "wayback_archiver_enricher" def __init__(self, config: dict) -> None: # without this STEP.__init__ is not called @@ -26,9 +25,19 @@ class WaybackEnricher(Enricher): "secret": {"default": None, "help": "wayback API secret. to get credentials visit https://archive.org/account/s3.php"} } - def enrich(self, to_enrich: Metadata) -> None: + def download(self, item: Metadata) -> Metadata: + result = Metadata() + result.merge(item) + if self.enrich(result): + return result.success("wayback") + + def enrich(self, to_enrich: Metadata) -> bool: url = to_enrich.get_url() - logger.debug(f"Enriching wayback for {url=}") + logger.debug(f"calling wayback for {url=}") + + if to_enrich.get("wayback"): + logger.info(f"Wayback enricher had already been executed: {to_enrich.get('wayback')}") + return True ia_headers = { "Accept": "application/json", @@ -39,10 +48,13 @@ class WaybackEnricher(Enricher): if r.status_code != 200: logger.error(em := f"Internet archive failed with status of {r.status_code}: {r.json()}") to_enrich.set("wayback", em) - return + return False # check job status - job_id = r.json()['job_id'] + job_id = r.json().get('job_id') + if not job_id: + logger.error(f"Wayback failed with {r.json()}") + return False # waits at most timeout seconds until job is completed, otherwise only enriches the job_id information start_time = time.time() @@ -50,12 +62,15 @@ class WaybackEnricher(Enricher): attempt = 1 while not wayback_url and time.time() - start_time <= self.timeout: try: - logger.debug(f"GETting status for {job_id=} on {url=} ({attempt=})") r_status = requests.get(f'https://web.archive.org/save/status/{job_id}', headers=ia_headers) r_json = r_status.json() if r_status.status_code == 200 and r_json['status'] == 'success': wayback_url = f"https://web.archive.org/web/{r_json['timestamp']}/{r_json['original_url']}" + elif r_status.status_code != 200 or r_json['status'] != 'pending': + logger.error(f"Wayback failed with {r_json}") + return False + except Exception as e: logger.warning(f"error fetching status for {url=} due to: {e}") if not wayback_url: @@ -66,4 +81,5 @@ class WaybackEnricher(Enricher): to_enrich.set("wayback", wayback_url) else: to_enrich.set("wayback", {"job_id": job_id, "check_status": f'https://web.archive.org/save/status/{job_id}'}) - to_enrich.set("wayback lookup", f"https://web.archive.org/web/*/{url}") + to_enrich.set("check wayback", f"https://web.archive.org/web/*/{url}") + return True diff --git a/src/formatters/templates/html_template.html b/src/formatters/templates/html_template.html index 9c3b54e..4855441 100644 --- a/src/formatters/templates/html_template.html +++ b/src/formatters/templates/html_template.html @@ -162,7 +162,6 @@ {% endfor %} -

    Made with bellingcat/auto-archiver