From 93be1af93f4efb22d2335c9c346349b7fae2f503 Mon Sep 17 00:00:00 2001
From: msramalho <19508417+msramalho@users.noreply.github.com>
Date: Tue, 18 Oct 2022 15:45:10 +0100
Subject: [PATCH 001/190] adds instagram post/profile
---
.gitignore | 3 +-
Pipfile | 4 +
Pipfile.lock | 78 +++++++++++++------
archivers/__init__.py | 3 +-
archivers/instagram_archiver.py | 128 ++++++++++++++++++++++++++++++++
archivers/telethon_archiver.py | 4 +-
auto_archive.py | 5 +-
configs/__init__.py | 3 +-
configs/config.py | 11 +++
configs/instagram_config.py | 8 ++
example.config.yaml | 5 ++
11 files changed, 220 insertions(+), 32 deletions(-)
create mode 100644 archivers/instagram_archiver.py
create mode 100644 configs/instagram_config.py
diff --git a/.gitignore b/.gitignore
index 4d19b9e..04b03ee 100644
--- a/.gitignore
+++ b/.gitignore
@@ -21,4 +21,5 @@ gd-token.json
credentials.json
secrets/*
browsertrix/*
-browsertrix-tmp/*
\ No newline at end of file
+browsertrix-tmp/*
+instaloader/*
\ No newline at end of file
diff --git a/Pipfile b/Pipfile
index 88fad6a..aa04ea4 100644
--- a/Pipfile
+++ b/Pipfile
@@ -25,6 +25,10 @@ pyyaml = "*"
dateparser = "*"
vk-url-scraper = "*"
python-twitter-v2 = "*"
+instaloader = "*"
[requires]
python_version = "3.9"
+
+[dev-packages]
+autopep8 = "*"
diff --git a/Pipfile.lock b/Pipfile.lock
index 271a661..e2d1b1b 100644
--- a/Pipfile.lock
+++ b/Pipfile.lock
@@ -1,7 +1,7 @@
{
"_meta": {
"hash": {
- "sha256": "1ed953d08e31d891de0f887e520f12025d109a20718b27dd8f9b361f73c95651"
+ "sha256": "bd987e7237c7e32d2dffb295db633f5a022ce1a718435d11d8ac303c9e37a4d3"
},
"pipfile-spec": 6,
"requires": {
@@ -29,7 +29,7 @@
"sha256:01c7bf666359b4967d2cda0000cc2e4af16a0ae098cbffcb8472fb9e8ad6585b",
"sha256:6ebb3d106c12920aaae42ccb6f787ef5eefdcdd166ea3d628fa8476abe712144"
],
- "markers": "python_full_version >= '3.5.0'",
+ "markers": "python_version >= '3.5'",
"version": "==1.10"
},
"attrs": {
@@ -37,7 +37,7 @@
"sha256:29adc2665447e5191d0e7c568fde78b21f9672d344281d0c6e1ab085429b22b6",
"sha256:86efa402f67bf2df34f51a335487cf46b1ec130d02b8d39fd248abfd30da551c"
],
- "markers": "python_full_version >= '3.5.0'",
+ "markers": "python_version >= '3.5'",
"version": "==22.1.0"
},
"authlib": {
@@ -159,7 +159,7 @@
"sha256:84c85a9078b11105f04f3036a9482ae10e4621616db313fe045dd24743a0820d",
"sha256:fe86415d55e84719d75f8b69414f6438ac3547d2078ab91b67e779ef69378412"
],
- "markers": "python_version >= '3.6'",
+ "markers": "python_full_version >= '3.6.0'",
"version": "==2022.6.15"
},
"cffi": {
@@ -236,7 +236,7 @@
"sha256:2857e29ff0d34db842cd7ca3230549d1a697f96ee6d3fb071cfa6c7393832597",
"sha256:6881edbebdb17b39b4eaaa821b438bf6eddffb4468cf344f09f89def34a8b1df"
],
- "markers": "python_full_version >= '3.5.0'",
+ "markers": "python_version >= '3.5'",
"version": "==2.0.12"
},
"click": {
@@ -290,7 +290,7 @@
"sha256:d4ef6cc305394ed669d4d9eebf10d3a101059bdcf2669c366ec1d14e4fb227bd",
"sha256:d9e69ae01f99abe6ad646947bba8941e896cb3aa805be2597a0400e0764b5818"
],
- "markers": "python_version >= '3.6'",
+ "markers": "python_full_version >= '3.6.0'",
"version": "==38.0.1"
},
"dataclasses-json": {
@@ -298,7 +298,7 @@
"sha256:bc285b5f892094c3a53d558858a88553dd6a61a11ab1a8128a0e554385dcc5dd",
"sha256:c2c11bc8214fbf709ffc369d11446ff6945254a7f09128154a7620613d8fda90"
],
- "markers": "python_version >= '3.6'",
+ "markers": "python_full_version >= '3.6.0'",
"version": "==0.5.7"
},
"dateparser": {
@@ -425,9 +425,16 @@
"sha256:84d9dd047ffa80596e0f246e2eab0b391788b0503584e8945f2368256d2735ff",
"sha256:9d643ff0a55b762d5cdb124b8eaa99c66322e2157b69160bc32796e824360e6d"
],
- "markers": "python_full_version >= '3.5.0'",
+ "markers": "python_version >= '3.5'",
"version": "==3.3"
},
+ "instaloader": {
+ "hashes": [
+ "sha256:ba925a87e2c305a3d24173d1bb0457d5a7e2e77dbac7206eeeb46f9104ecb08e"
+ ],
+ "index": "pypi",
+ "version": "==4.9.5"
+ },
"itsdangerous": {
"hashes": [
"sha256:2c2349112351b88699d8d4b6b075022c0808887cb7ad10069318a8b0bc88db44",
@@ -625,7 +632,7 @@
"sha256:8139f29aac13e25d502680e9e19963e83f16838d48a0d71c287fe40e7067fbca",
"sha256:9859c40929662bec5d64f34d01c99e093149682a3f38915dc0655d5a633dd918"
],
- "markers": "python_version >= '3.6'",
+ "markers": "python_full_version >= '3.6.0'",
"version": "==3.2.2"
},
"outcome": {
@@ -641,7 +648,7 @@
"sha256:dd47c42927d89ab911e606518907cc2d3a1f38bbd026385970643f9c5b8ecfeb",
"sha256:ef103e05f519cdc783ae24ea4e2e0f508a9c99b2d4969652eed6a2e1ea5bd522"
],
- "markers": "python_version >= '3.6'",
+ "markers": "python_full_version >= '3.6.0'",
"version": "==21.3"
},
"protobuf": {
@@ -754,7 +761,7 @@
"sha256:56a8508ae95f98e2b9bdf93a6be5ae3f7d8af858b43e02c5a2ff083726be40c1",
"sha256:f643f331ab57ba3c9d89212ee4a2dabc6e94f117cf4eefde99a0574720d14c42"
],
- "markers": "python_version >= '3.6'",
+ "markers": "python_full_version >= '3.6.0'",
"version": "==2.13.0"
},
"pyparsing": {
@@ -786,7 +793,7 @@
"sha256:b7e3b04a59693c42c36f9ab1cc2acc46fa5df8c78e178fc33a8d4cd05c8d498f",
"sha256:d92a187be61fe482e4fd675b6d52200e7be63a12b724abbf931a40ce4fa92938"
],
- "markers": "python_full_version >= '3.5.0'",
+ "markers": "python_version >= '3.5'",
"version": "==0.20.0"
},
"python-slugify": {
@@ -807,10 +814,9 @@
},
"pytz": {
"hashes": [
- "sha256:2c0784747071402c6e99f0bafdb7da0fa22645f06554c7ae06bf6358897e9c91",
- "sha256:48ce799d83b6f8aab2020e369b627446696619e79645419610b9facd909b3174"
+ "sha256:335ab46900b1465e714b4fda4963d87363264eb662aab5e65da039c25f1f5b22"
],
- "version": "==2022.4"
+ "version": "==2022.5"
},
"pytz-deprecation-shim": {
"hashes": [
@@ -943,7 +949,7 @@
"sha256:fbc88d3ba402b5d041d204ec2449c4078898f89c4a6e6f0ed1c1a510ef1e221d",
"sha256:fbd3fe37353c62fd0eb19fb76f78aa693716262bcd5f9c14bb9e5aca4b3f0dc4"
],
- "markers": "python_version >= '3.6'",
+ "markers": "python_full_version >= '3.6.0'",
"version": "==2022.3.2"
},
"requests": {
@@ -983,7 +989,7 @@
"sha256:90260d9058e514786967344d0ef75fa8727eed8a7d2e43ce9f4bcf1b536174f7",
"sha256:e38464a49c6c85d7f1351b0126661487a7e0a14a50f1675ec50eb34d4f20ef21"
],
- "markers": "python_version >= '3.6' and python_version < '4'",
+ "markers": "python_version < '4' and python_full_version >= '3.6.0'",
"version": "==4.9"
},
"s3transfer": {
@@ -1037,7 +1043,7 @@
"sha256:3b2503d3c7084a42b1ebd08116e5f81aadfaea95863628c80a3b774a11b7c759",
"sha256:fc53893b3da2c33de295667a0e19f078c14bf86544af307354de5fcf12a3f30d"
],
- "markers": "python_version >= '3.6'",
+ "markers": "python_full_version >= '3.6.0'",
"version": "==2.3.2.post1"
},
"telethon": {
@@ -1083,7 +1089,7 @@
"sha256:5b558f6e83cc20a37c3b61202476c5295d1addf57bd65543364e0337e37ed2bc",
"sha256:a3d34de8fac26023eee701ed1e7bf4da9a8326b61a62934ec9e53b64970fd8fe"
],
- "markers": "python_full_version >= '3.5.0'",
+ "markers": "python_version >= '3.5'",
"version": "==0.9.2"
},
"typing-extensions": {
@@ -1106,7 +1112,7 @@
"sha256:323161b22b7802fdc78f20ca5f6073639c64f1a7227c40cd3e19fd1d0ce6650a",
"sha256:e15b2b3005e2546108af42a0eb4ccab4d9e225e2dfbf4f77aad50c70a4b1f3ab"
],
- "markers": "python_version >= '3.6'",
+ "markers": "python_full_version >= '3.6.0'",
"version": "==2022.5"
},
"tzlocal": {
@@ -1114,7 +1120,7 @@
"sha256:89885494684c929d9191c57aa27502afc87a579be5cdd3225c77c463ea043745",
"sha256:ee5842fa3a795f023514ac2d801c4a81d1743bbe642e3940143326b3a00addd7"
],
- "markers": "python_version >= '3.6'",
+ "markers": "python_full_version >= '3.6.0'",
"version": "==4.2"
},
"uritemplate": {
@@ -1122,11 +1128,10 @@
"sha256:4346edfc5c3b79f694bccd6d6099a322bbeb628dbf2cd86eea55a456ce5124f0",
"sha256:830c08b8d99bdd312ea4ead05994a38e8936266f84b9a7878232db50b044e02e"
],
- "markers": "python_version >= '3.6'",
+ "markers": "python_full_version >= '3.6.0'",
"version": "==4.1.1"
},
"urllib3": {
- "extras": [],
"hashes": [
"sha256:44ece4d53fb1706f667c9bd1c648f5469a2ec925fcf3a776667042d645472c14",
"sha256:aabaf16477806a5e1dd19aa41f8c2b7950dd3c746362d7e3223dbe6de6ac448e"
@@ -1228,5 +1233,30 @@
"version": "==2022.5.18"
}
},
- "develop": {}
+ "develop": {
+ "autopep8": {
+ "hashes": [
+ "sha256:6f09e90a2be784317e84dc1add17ebfc7abe3924239957a37e5040e27d812087",
+ "sha256:ca9b1a83e53a7fad65d731dc7a2a2d50aa48f43850407c59f6a1a306c4201142"
+ ],
+ "index": "pypi",
+ "version": "==1.7.0"
+ },
+ "pycodestyle": {
+ "hashes": [
+ "sha256:2c9607871d58c76354b697b42f5d57e1ada7d261c261efac224b664affdc5785",
+ "sha256:d1735fc58b418fd7c5f658d28d943854f8a849b01a5d0a1e6f3f3fdd0166804b"
+ ],
+ "markers": "python_version >= '3.6'",
+ "version": "==2.9.1"
+ },
+ "toml": {
+ "hashes": [
+ "sha256:806143ae5bfb6a3c6e736a764057db0e6a0e05e338b5630894a5f779cabb4f9b",
+ "sha256:b3bda1d108d5dd99f4a20d24d9c348e91c4db7ab1b749200bded2f839ccbe68f"
+ ],
+ "markers": "python_version >= '2.6' and python_version not in '3.0, 3.1, 3.2, 3.3'",
+ "version": "==0.10.2"
+ }
+ }
}
diff --git a/archivers/__init__.py b/archivers/__init__.py
index 403ebea..7f51e39 100644
--- a/archivers/__init__.py
+++ b/archivers/__init__.py
@@ -7,4 +7,5 @@ from .wayback_archiver import WaybackArchiver
from .youtubedl_archiver import YoutubeDLArchiver
from .twitter_archiver import TwitterArchiver
from .vk_archiver import VkArchiver
-from .twitter_api_archiver import TwitterApiArchiver
\ No newline at end of file
+from .twitter_api_archiver import TwitterApiArchiver
+from .instagram_archiver import InstagramArchiver
\ No newline at end of file
diff --git a/archivers/instagram_archiver.py b/archivers/instagram_archiver.py
new file mode 100644
index 0000000..1539527
--- /dev/null
+++ b/archivers/instagram_archiver.py
@@ -0,0 +1,128 @@
+import re, os, shutil, html
+import instaloader # https://instaloader.github.io/as-module.html
+from loguru import logger
+
+from .base_archiver import Archiver, ArchiveResult
+from configs import Config
+from storages import Storage
+
+
+class InstagramArchiver(Archiver):
+ """
+ Uses Instaloader to download either a post (inc images, videos, text) or as much as possible from a profile (posts, stories, highlights, )
+ """
+ name = "instagram"
+ DOWNLOAD_FOLDER = "instaloader"
+    # NB: post should be tested before profile (a post URL also matches profile_pattern)
+    # adapted from https://regex101.com/r/MGPquX/1: literal dots escaped, '-' allowed because shortcodes may contain it
+    post_pattern = re.compile(r"(?:(?:http|https):\/\/)?(?:www\.)?(?:instagram\.com|instagr\.am|instagr\.com)\/(?:p|reel)\/([\w-]+)")
+    # adapted from https://regex101.com/r/6Wbsxa/1: literal dots escaped, '.' allowed because usernames may contain it
+    profile_pattern = re.compile(r"(?:(?:http|https):\/\/)?(?:www\.)?(?:instagram\.com|instagr\.am|instagr\.com)\/([\w.]+)")
+
+ def __init__(self, storage: Storage, config: Config):
+ super().__init__(storage, config)
+ self.insta = instaloader.Instaloader(download_geotags=True, download_comments=True, compress_json=False, dirname_pattern=self.DOWNLOAD_FOLDER, filename_pattern="{date_utc}_UTC_{target}__{typename}")
+ if config.instagram_config:
+ self.insta.login(config.instagram_config.username, config.instagram_config.password)
+
+    def download(self, url, check_if_exists=False):
+        post_matches = self.post_pattern.findall(url)
+        profile_matches = self.profile_pattern.findall(url)
+
+        # neither pattern matched: not an instagram link, nothing to do
+        if not (post_matches or profile_matches):
+            return
+
+        # short-circuit when the html page for this url is already in storage
+        key = self.get_html_key(url)
+        if check_if_exists and self.storage.exists(key):
+            # only s3 storage supports storage.exists as not implemented on gd
+            return ArchiveResult(
+                status='already archived',
+                cdn_url=self.storage.get_cdn_url(key),
+                screenshot=self.get_screenshot(url),
+                wacz=self.get_wacz(url))
+
+        try:
+            # a post link also matches the profile pattern, so posts are handled first
+            if post_matches:
+                return self.download_post(url, post_matches[0])
+            # otherwise treat the link as a profile
+            if profile_matches:
+                return self.download_profile(url, profile_matches[0])
+        finally:
+            shutil.rmtree(self.DOWNLOAD_FOLDER, ignore_errors=True)
+
+    def download_post(self, url, post_id):
+        # resolve the shortcode to a Post, download it, then upload whatever was saved; returns None if nothing was downloaded
+        logger.debug(f"Instagram {post_id=} detected in {url=}")
+        post = instaloader.Post.from_shortcode(self.insta.context, post_id)
+        if not self.insta.download_post(post, target=post.owner_username): return
+        return self.upload_downloaded_content(url, post.title, post._asdict(), post.date)
+
+    def download_profile(self, url, username):
+        # gets posts, posts where username is tagged, igtv posts, stories, and highlights;
+        # every source is best-effort: a failure is logged and the remaining sources still run
+        logger.debug(f"Instagram {username=} detected in {url=}")
+
+        profile = instaloader.Profile.from_username(self.insta.context, username)
+
+        # post listings exposed by Profile, one tuple per source:
+        # (Profile method name, download target prefix, label used in the error log)
+        post_sources = [
+            ("get_posts", "profile_post", "post"),
+            ("get_tagged_posts", "tagged_post", "tagged post"),
+            ("get_igtv_posts", "igtv_post", "igtv post"),
+        ]
+        for getter, prefix, label in post_sources:
+            try:
+                # the lookup and call stay inside the try so a failing listing is logged, not raised
+                for post in getattr(profile, getter)():
+                    try: self.insta.download_post(post, target=f"{prefix}_{post.owner_username}")
+                    except Exception as e: logger.error(f"Failed to download {label}: {post.shortcode}: {e}")
+            except Exception as e: logger.error(f"Failed profile.{getter}: {e}")
+
+        # stories and highlights are fetched through the Instaloader instance, one tuple per source:
+        # (method name for the error log, item kind, deferred call so fetch errors are caught below)
+        item_sources = [
+            ("get_stories", "story", lambda: self.insta.get_stories([profile.userid])),
+            ("get_highlights", "highlight", lambda: self.insta.get_highlights(profile.userid)),
+        ]
+        for api_name, kind, fetch in item_sources:
+            try:
+                # fetch() performs the deferred call here, inside the try
+                for container in fetch():
+                    for item in container.get_items():
+                        try: self.insta.download_storyitem(item, target=f"{kind}_item_{container.owner_username}")
+                        except Exception as e: logger.error(f"Failed to download {kind} item: {item}: {e}")
+            except Exception as e: logger.error(f"Failed {api_name}: {e}")
+
+        return self.upload_downloaded_content(url, f"@{username}", profile._asdict(), None)
+
+    def upload_downloaded_content(self, url, title, content, date):
+        # Uploads every file found in DOWNLOAD_FOLDER, builds the media html page and returns an ArchiveResult.
+        # Result fields default to None so the error path no longer hits unbound locals (assumes ArchiveResult accepts None — TODO confirm).
+        status, page_cdn, page_hash, screenshot, wacz = "success", None, None, None, None
+        try:
+            uploaded_media = []
+            for f in os.listdir(self.DOWNLOAD_FOLDER):
+                if os.path.isfile((filename := os.path.join(self.DOWNLOAD_FOLDER, f))):
+                    key = self.get_key(filename)
+                    self.storage.upload(filename, key)
+                    file_hash = self.get_hash(filename)
+                    cdn_url = self.storage.get_cdn_url(key)
+                    uploaded_media.append({'cdn_url': cdn_url, 'key': key, 'hash': file_hash})
+            # explicit raise instead of assert so the check survives `python -O`; threshold kept as in the original
+            if len(uploaded_media) <= 1: raise ValueError("No uploaded media found")
+            uploaded_media.sort(key=lambda m: m["key"], reverse=True)
+            page_cdn, page_hash, _ = self.generate_media_page_html(url, uploaded_media, html.escape(str(content)))
+        except Exception as e:
+            logger.error(f"Could not fetch instagram post {url} due to: {e}")
+            status = "error"
+        finally:
+            shutil.rmtree(self.DOWNLOAD_FOLDER, ignore_errors=True)
+
+        if status == "success":
+            screenshot = self.get_screenshot(url)
+            wacz = self.get_wacz(url)
+        return ArchiveResult(status=status, cdn_url=page_cdn, title=title, timestamp=date, hash=page_hash, screenshot=screenshot, wacz=wacz)
diff --git a/archivers/telethon_archiver.py b/archivers/telethon_archiver.py
index 9f9bbbf..5c147de 100644
--- a/archivers/telethon_archiver.py
+++ b/archivers/telethon_archiver.py
@@ -1,6 +1,4 @@
-import os, re
-
-import html
+import os, re, html
from loguru import logger
from telethon.sync import TelegramClient
from telethon.errors import ChannelInvalidError
diff --git a/auto_archive.py b/auto_archive.py
index 50719a3..3412b0a 100644
--- a/auto_archive.py
+++ b/auto_archive.py
@@ -4,8 +4,8 @@ from loguru import logger
from slugify import slugify
from urllib.parse import quote
-from archivers import TelethonArchiver, TelegramArchiver, TiktokArchiver, YoutubeDLArchiver, TwitterArchiver, TwitterApiArchiver, VkArchiver, WaybackArchiver, ArchiveResult, Archiver
-from utils import GWorksheet, mkdir_if_not_exists, expand_url
+from archivers import TelethonArchiver, TelegramArchiver, TiktokArchiver, YoutubeDLArchiver, TwitterArchiver, TwitterApiArchiver, VkArchiver, WaybackArchiver, InstagramArchiver, ArchiveResult, Archiver
+from utils import GWorksheet, expand_url
from configs import Config
from storages import Storage
@@ -111,6 +111,7 @@ def process_sheet(c: Config):
TelethonArchiver(storage, c),
TiktokArchiver(storage, c),
TwitterApiArchiver(storage, c),
+ InstagramArchiver(storage, c),
YoutubeDLArchiver(storage, c),
TelegramArchiver(storage, c),
TwitterArchiver(storage, c),
diff --git a/configs/__init__.py b/configs/__init__.py
index 6940ed3..1f01b62 100644
--- a/configs/__init__.py
+++ b/configs/__init__.py
@@ -3,4 +3,5 @@ from .selenium_config import SeleniumConfig
from .telethon_config import TelethonConfig
from .wayback_config import WaybackConfig
from .twitter_api_config import TwitterApiConfig
-from .vk_config import VkConfig
\ No newline at end of file
+from .vk_config import VkConfig
+from .instagram_config import InstagramConfig
\ No newline at end of file
diff --git a/configs/config.py b/configs/config.py
index 6e97dc4..80e4881 100644
--- a/configs/config.py
+++ b/configs/config.py
@@ -12,6 +12,7 @@ from .selenium_config import SeleniumConfig
from .vk_config import VkConfig
from .twitter_api_config import TwitterApiConfig
from .browsertrix_config import BrowsertrixConfig
+from .instagram_config import InstagramConfig
from storages import S3Config, S3Storage, GDStorage, GDConfig, LocalStorage, LocalConfig
@@ -180,6 +181,16 @@ class Config:
self.vk_config = None
logger.debug(f"'vk' key not present in the {self.config_file=}")
+ # instagram config
+ if "instagram" in secrets:
+ self.instagram_config = InstagramConfig(
+ username=secrets["instagram"]["username"],
+ password=secrets["instagram"]["password"]
+ )
+ else:
+ self.instagram_config = None
+ logger.debug(f"'instagram' key not present in the {self.config_file=}")
+
del self.config["secrets"] # delete to prevent leaks
def set_log_files(self):
diff --git a/configs/instagram_config.py b/configs/instagram_config.py
new file mode 100644
index 0000000..73c45bc
--- /dev/null
+++ b/configs/instagram_config.py
@@ -0,0 +1,8 @@
+
+from dataclasses import dataclass
+
+# Instagram credentials; InstagramArchiver passes username and password to its instaloader login call
+@dataclass
+class InstagramConfig:
+ username: str
+ password: str
diff --git a/example.config.yaml b/example.config.yaml
index e42d10f..b26f58f 100644
--- a/example.config.yaml
+++ b/example.config.yaml
@@ -67,6 +67,11 @@ secrets:
username: "phone number or email"
password: "password"
+ # instagram credentials
+ instagram:
+ username: "username"
+ password: "password"
+
google_sheets:
# local filename: defaults to service_account.json, see https://gspread.readthedocs.io/en/latest/oauth2.html#for-bots-using-service-account
service_account: "service_account.json"
From 3f121d800e0d0dd386a0943710c784357c35949f Mon Sep 17 00:00:00 2001
From: msramalho <19508417+msramalho@users.noreply.github.com>
Date: Tue, 18 Oct 2022 16:36:27 +0100
Subject: [PATCH 002/190] catch bad instagram login
---
archivers/instagram_archiver.py | 6 +++++-
1 file changed, 5 insertions(+), 1 deletion(-)
diff --git a/archivers/instagram_archiver.py b/archivers/instagram_archiver.py
index 1539527..5d2fefe 100644
--- a/archivers/instagram_archiver.py
+++ b/archivers/instagram_archiver.py
@@ -23,7 +23,11 @@ class InstagramArchiver(Archiver):
super().__init__(storage, config)
self.insta = instaloader.Instaloader(download_geotags=True, download_comments=True, compress_json=False, dirname_pattern=self.DOWNLOAD_FOLDER, filename_pattern="{date_utc}_UTC_{target}__{typename}")
if config.instagram_config:
- self.insta.login(config.instagram_config.username, config.instagram_config.password)
+ try:
+ self.insta.login(config.instagram_config.username, config.instagram_config.
+ password)
+ except Exception as e:
+ logger.error(f"Unable to finish login: {e}")
def download(self, url, check_if_exists=False):
post_matches = self.post_pattern.findall(url)
From 63f53358d3c92170c85b988d4e284e80a52443ff Mon Sep 17 00:00:00 2001
From: msramalho <19508417+msramalho@users.noreply.github.com>
Date: Tue, 18 Oct 2022 16:38:12 +0100
Subject: [PATCH 003/190] adds traceback
---
archivers/instagram_archiver.py | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/archivers/instagram_archiver.py b/archivers/instagram_archiver.py
index 5d2fefe..46b53b8 100644
--- a/archivers/instagram_archiver.py
+++ b/archivers/instagram_archiver.py
@@ -1,4 +1,4 @@
-import re, os, shutil, html
+import re, os, shutil, html, traceback
import instaloader # https://instaloader.github.io/as-module.html
from loguru import logger
@@ -27,7 +27,7 @@ class InstagramArchiver(Archiver):
self.insta.login(config.instagram_config.username, config.instagram_config.
password)
except Exception as e:
- logger.error(f"Unable to finish login: {e}")
+ logger.error(f"Unable to finish login: {e}\n{traceback.format_exc()}")
def download(self, url, check_if_exists=False):
post_matches = self.post_pattern.findall(url)
From 6c80a5b82d5df5e2e26afb646ff385318466e556 Mon Sep 17 00:00:00 2001
From: msramalho <19508417+msramalho@users.noreply.github.com>
Date: Tue, 18 Oct 2022 17:35:59 +0100
Subject: [PATCH 004/190] session file logic
---
.gitignore | 3 ++-
archivers/instagram_archiver.py | 10 +++++++++-
configs/config.py | 3 ++-
configs/instagram_config.py | 1 +
example.config.yaml | 1 +
5 files changed, 15 insertions(+), 3 deletions(-)
diff --git a/.gitignore b/.gitignore
index 04b03ee..59ed096 100644
--- a/.gitignore
+++ b/.gitignore
@@ -22,4 +22,5 @@ credentials.json
secrets/*
browsertrix/*
browsertrix-tmp/*
-instaloader/*
\ No newline at end of file
+instaloader/*
+instaloader.session
\ No newline at end of file
diff --git a/archivers/instagram_archiver.py b/archivers/instagram_archiver.py
index 46b53b8..d7a3989 100644
--- a/archivers/instagram_archiver.py
+++ b/archivers/instagram_archiver.py
@@ -26,8 +26,16 @@ class InstagramArchiver(Archiver):
try:
self.insta.login(config.instagram_config.username, config.instagram_config.
password)
+ #TODO: wait for this issue to be fixed https://github.com/instaloader/instaloader/issues/1758
+ self.insta.save_session_to_file(config.instagram_config.session_file)
except Exception as e:
- logger.error(f"Unable to finish login: {e}\n{traceback.format_exc()}")
+ logger.error(f"Unable to finish login (retrying from file): {e}\n{traceback.format_exc()}")
+ try:
+ self.insta.load_session_from_file(config.instagram_config.username, config.instagram_config.session_file)
+ except Exception as e2:
+ logger.error(f"Unable to login from session file: {e2}\n{traceback.format_exc()}")
+
+
def download(self, url, check_if_exists=False):
post_matches = self.post_pattern.findall(url)
diff --git a/configs/config.py b/configs/config.py
index 80e4881..372b3d7 100644
--- a/configs/config.py
+++ b/configs/config.py
@@ -185,7 +185,8 @@ class Config:
if "instagram" in secrets:
self.instagram_config = InstagramConfig(
username=secrets["instagram"]["username"],
- password=secrets["instagram"]["password"]
+ password=secrets["instagram"]["password"],
+ session_file=secrets["instagram"].get("session_file", "instaloader.session")
)
else:
self.instagram_config = None
diff --git a/configs/instagram_config.py b/configs/instagram_config.py
index 73c45bc..a9f26b4 100644
--- a/configs/instagram_config.py
+++ b/configs/instagram_config.py
@@ -6,3 +6,4 @@ from dataclasses import dataclass
class InstagramConfig:
username: str
password: str
+ session_file: str
diff --git a/example.config.yaml b/example.config.yaml
index b26f58f..7cd4ecb 100644
--- a/example.config.yaml
+++ b/example.config.yaml
@@ -71,6 +71,7 @@ secrets:
instagram:
username: "username"
password: "password"
+ session_file: "instaloader.session" # <- default value
google_sheets:
# local filename: defaults to service_account.json, see https://gspread.readthedocs.io/en/latest/oauth2.html#for-bots-using-service-account
From 54c572258c314d4c93019c7625f0affcec07f528 Mon Sep 17 00:00:00 2001
From: msramalho <19508417+msramalho@users.noreply.github.com>
Date: Tue, 18 Oct 2022 17:46:40 +0100
Subject: [PATCH 005/190] fix tty
---
archivers/base_archiver.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/archivers/base_archiver.py b/archivers/base_archiver.py
index 96e0fbf..40ad861 100644
--- a/archivers/base_archiver.py
+++ b/archivers/base_archiver.py
@@ -215,7 +215,7 @@ class Archiver(ABC):
cmd = [
"docker", "run",
"-v", f"{browsertrix_home}:/crawls/",
- "-it",
+ # "-it", # this leads to "the input device is not a TTY"
"webrecorder/browsertrix-crawler", "crawl",
"--url", url,
"--scopeType", "page",
From 4d2b7b404097ce6251ea04510ea3082a42cdffdd Mon Sep 17 00:00:00 2001
From: msramalho <19508417+msramalho@users.noreply.github.com>
Date: Wed, 19 Oct 2022 11:27:17 +0100
Subject: [PATCH 006/190] reverse order of login attempts
---
archivers/instagram_archiver.py | 14 +++++++-------
1 file changed, 7 insertions(+), 7 deletions(-)
diff --git a/archivers/instagram_archiver.py b/archivers/instagram_archiver.py
index d7a3989..a2b1147 100644
--- a/archivers/instagram_archiver.py
+++ b/archivers/instagram_archiver.py
@@ -24,16 +24,16 @@ class InstagramArchiver(Archiver):
self.insta = instaloader.Instaloader(download_geotags=True, download_comments=True, compress_json=False, dirname_pattern=self.DOWNLOAD_FOLDER, filename_pattern="{date_utc}_UTC_{target}__{typename}")
if config.instagram_config:
try:
- self.insta.login(config.instagram_config.username, config.instagram_config.
- password)
- #TODO: wait for this issue to be fixed https://github.com/instaloader/instaloader/issues/1758
- self.insta.save_session_to_file(config.instagram_config.session_file)
+ self.insta.load_session_from_file(config.instagram_config.username, config.instagram_config.session_file)
except Exception as e:
- logger.error(f"Unable to finish login (retrying from file): {e}\n{traceback.format_exc()}")
+ logger.error(f"Unable to login from session file: {e}\n{traceback.format_exc()}")
try:
- self.insta.load_session_from_file(config.instagram_config.username, config.instagram_config.session_file)
+ self.insta.login(config.instagram_config.username, config.instagram_config.
+ password)
+ #TODO: wait for this issue to be fixed https://github.com/instaloader/instaloader/issues/1758
+ self.insta.save_session_to_file(config.instagram_config.session_file)
except Exception as e2:
- logger.error(f"Unable to login from session file: {e2}\n{traceback.format_exc()}")
+ logger.error(f"Unable to finish login with username and password: {e2}\n{traceback.format_exc()}")
From ac4f1b6132efa0d038463d242d48cc9456af5023 Mon Sep 17 00:00:00 2001
From: msramalho <19508417+msramalho@users.noreply.github.com>
Date: Wed, 19 Oct 2022 11:37:04 +0100
Subject: [PATCH 007/190] readme updates
---
README.md | 1 +
1 file changed, 1 insertion(+)
diff --git a/README.md b/README.md
index e420629..dbaabc2 100644
--- a/README.md
+++ b/README.md
@@ -168,6 +168,7 @@ graph TD
A(Archiver) -->|parent of| B(TelethonArchiver)
A -->|parent of| C(TiktokArchiver)
A -->|parent of| D(YoutubeDLArchiver)
+ A -->|parent of| IG(InstagramArchiver)
A -->|parent of| E(TelegramArchiver)
A -->|parent of| F(TwitterArchiver)
A -->|parent of| G(VkArchiver)
From 22363cb8b9c22c4d549fd7ee5187de69875edee5 Mon Sep 17 00:00:00 2001
From: msramalho <19508417+msramalho@users.noreply.github.com>
Date: Thu, 20 Oct 2022 11:59:23 +0100
Subject: [PATCH 008/190] adds information on browsertrix usage
---
README.md | 4 ++--
example.config.yaml | 2 +-
2 files changed, 3 insertions(+), 3 deletions(-)
diff --git a/README.md b/README.md
index dbaabc2..8bdc7d5 100644
--- a/README.md
+++ b/README.md
@@ -18,8 +18,8 @@ You also need:
3. [firefox](https://www.mozilla.org/en-US/firefox/new/) and [geckodriver](https://github.com/mozilla/geckodriver/releases) on a path folder like `/usr/local/bin`.
4. [fonts-noto](https://fonts.google.com/noto) to deal with multiple unicode characters during selenium/geckodriver's screenshots: `sudo apt install fonts-noto -y`.
5. Internet Archive credentials can be retrieved from https://archive.org/account/s3.php.
-6. If you would like to take archival [WACZ](https://specs.webrecorder.net/wacz/1.1.1/) snapshots using [browsertrix-crawler](https://github.com/webrecorder/browsertrix-crawler)
- in addition to screenshots you will need to install [Docker](https://www.docker.com/).
+6. If you would like to take archival [WACZ](https://specs.webrecorder.net/wacz/1.1.1/) snapshots using [browsertrix-crawler](https://github.com/webrecorder/browsertrix-crawler) in addition to screenshots you will need to install [Docker](https://www.docker.com/).
+ 1. To improve the websites browsertrix can archive you can also create a custom profile by running `docker run -p 9222:9222 -p 9223:9223 -v $PWD/browsertrix/crawls/profiles:/crawls/profiles/ -it webrecorder/browsertrix-crawler create-login-profile --interactive --url "https://youtube.com"`, going to [http://localhost:9223/](http://localhost:9223/) and accepting the cookies prompt on youtube, and then navigating to other websites and logging in as per your needs, so as to access more publicly blocked content, and then specifying the created `profile.tar.gz` in your config file under `execution.browsertrix.profile`.
### Configuration file
Configuration is done via a config.yaml file (see [example.config.yaml](example.config.yaml)) and some properties of that file can be overwritten via command line arguments. Here is the current result from running the `python auto_archive.py --help`:
diff --git a/example.config.yaml b/example.config.yaml
index 7cd4ecb..c6ad8f8 100644
--- a/example.config.yaml
+++ b/example.config.yaml
@@ -134,6 +134,6 @@ execution:
screenshot: screenshot
hash: hash
wacz: wacz
- # if you want the replaypage to work, make sure to allow CORS on your bucket
+ # if you want the replaypage to work, make sure to allow CORS on your bucket, see https://replayweb.page/docs/embedding#cors-restrictions
replaywebpage: replaywebpage
From 7a700acd8e59d3ce7c4bc53479c6270bc4721bd2 Mon Sep 17 00:00:00 2001
From: msramalho <19508417+msramalho@users.noreply.github.com>
Date: Mon, 31 Oct 2022 10:35:01 +0000
Subject: [PATCH 009/190] hotfix for #65
---
archivers/base_archiver.py | 8 ++++++++
1 file changed, 8 insertions(+)
diff --git a/archivers/base_archiver.py b/archivers/base_archiver.py
index 40ad861..076b7ca 100644
--- a/archivers/base_archiver.py
+++ b/archivers/base_archiver.py
@@ -240,6 +240,14 @@ class Archiver(ABC):
except Exception as e:
logger.error(f"WACZ generation failed: {e}")
return
+ try:
+ # TODO: is there a better way to manage the containers, like reusing?
+ # https://github.com/bellingcat/auto-archiver/issues/65
+ cmd_clean_docker = 'docker rm $(docker stop $(docker ps -a -q --filter ancestor=webrecorder/browsertrix-crawler --format="{{.ID}}"))'
+ logger.info(f"Cleaning docker containers: {cmd_clean_docker}")
+ os.system(cmd_clean_docker)
+ except Exception as e:
+ logger.error(f"Could not clean dangling docker containers: {e}")
filename = os.path.join(browsertrix_home, "collections", collection, f"{collection}.wacz")
From 29e1872e872573de9cd9188d093b78023c1765b4 Mon Sep 17 00:00:00 2001
From: msramalho <19508417+msramalho@users.noreply.github.com>
Date: Mon, 31 Oct 2022 10:41:27 +0000
Subject: [PATCH 010/190] fix: rm stopped containers only
---
archivers/base_archiver.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/archivers/base_archiver.py b/archivers/base_archiver.py
index 076b7ca..50fa588 100644
--- a/archivers/base_archiver.py
+++ b/archivers/base_archiver.py
@@ -243,7 +243,7 @@ class Archiver(ABC):
try:
# TODO: is there a better way to manage the containers, like reusing?
# https://github.com/bellingcat/auto-archiver/issues/65
- cmd_clean_docker = 'docker rm $(docker stop $(docker ps -a -q --filter ancestor=webrecorder/browsertrix-crawler --format="{{.ID}}"))'
+ cmd_clean_docker = 'docker rm $(docker ps -a -q --filter ancestor=webrecorder/browsertrix-crawler --format="{{.ID}}")'
logger.info(f"Cleaning docker containers: {cmd_clean_docker}")
os.system(cmd_clean_docker)
except Exception as e:
From c8fa077df7d01de3c62c2b64e3fc2830dc9b22fd Mon Sep 17 00:00:00 2001
From: msramalho <19508417+msramalho@users.noreply.github.com>
Date: Mon, 31 Oct 2022 17:10:55 +0000
Subject: [PATCH 011/190] docker initial files
---
.dockerignore | 17 +++++++++++++++++
Dockerfile | 20 ++++++++++++++++++++
2 files changed, 37 insertions(+)
create mode 100644 .dockerignore
create mode 100644 Dockerfile
diff --git a/.dockerignore b/.dockerignore
new file mode 100644
index 0000000..a4d2bbb
--- /dev/null
+++ b/.dockerignore
@@ -0,0 +1,17 @@
+logs/
+browsertrix-tmp/
+tmp*/
+temp/
+.DS_Store
+__pycache__/
+local_archive/
+config*.json
+config.json
+*.env
+credentials.json
+secrets/
+instaloader/
+instaloader.session
+vk_config*.json
+anon*
+geckodriver.log
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..61a82c2
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,20 @@
+From python:3.10
+
+WORKDIR /usr/src/app
+
+COPY . .
+
+
+# TODO: use custom ffmpeg builds instead of apt-get install
+RUN pip install --upgrade pip && \
+ pip install pipenv && \
+ apt-get update && \
+ apt-get install -y gcc ffmpeg fonts-noto firefox-esr && \
+ wget https://github.com/mozilla/geckodriver/releases/download/v0.32.0/geckodriver-v0.32.0-linux64.tar.gz && \
+ tar -xvzf geckodriver* -C /usr/bin && \
+ chmod +x /usr/bin/geckodriver && \
+ rm geckodriver-v* && \
+ export PATH=$PATH:/usr/bin/ && \
+ pipenv install --python=3.10
+
+CMD ["pipenv", "run", "python", "auto_archive.py"]
\ No newline at end of file
From a9df992f6663b8f51672186f8c788a682df7659a Mon Sep 17 00:00:00 2001
From: msramalho <19508417+msramalho@users.noreply.github.com>
Date: Wed, 2 Nov 2022 16:51:32 +0000
Subject: [PATCH 012/190] WiP
---
Dockerfile | 18 +++++++++++-------
1 file changed, 11 insertions(+), 7 deletions(-)
diff --git a/Dockerfile b/Dockerfile
index 61a82c2..b2de36a 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,6 +1,6 @@
From python:3.10
-WORKDIR /usr/src/app
+WORKDIR /app
COPY . .
@@ -9,12 +9,16 @@ COPY . .
RUN pip install --upgrade pip && \
pip install pipenv && \
apt-get update && \
- apt-get install -y gcc ffmpeg fonts-noto firefox-esr && \
- wget https://github.com/mozilla/geckodriver/releases/download/v0.32.0/geckodriver-v0.32.0-linux64.tar.gz && \
- tar -xvzf geckodriver* -C /usr/bin && \
- chmod +x /usr/bin/geckodriver && \
+ apt-get install -y gcc ffmpeg fonts-noto firefox-esr
+
+RUN wget https://github.com/mozilla/geckodriver/releases/download/v0.32.0/geckodriver-v0.32.0-linux64.tar.gz && \
+ tar -xvzf geckodriver* -C /usr/local/bin && \
+ chmod +x /usr/local/bin/geckodriver && \
rm geckodriver-v* && \
- export PATH=$PATH:/usr/bin/ && \
pipenv install --python=3.10
-CMD ["pipenv", "run", "python", "auto_archive.py"]
\ No newline at end of file
+# CMD ["pipenv", "run", "python", "auto_archive.py"]
+ENTRYPOINT ["pipenv", "run", "python", "auto_archive.py"]
+
+# should be executed with 2 volumes
+# docker run -v /var/run/docker.sock:/var/run/docker.sock -v $PWD/secrets:/app/ aa --help
\ No newline at end of file
From 50e03ba565918ecd4b2c3bf56b62ea64d141a676 Mon Sep 17 00:00:00 2001
From: msramalho <19508417+msramalho@users.noreply.github.com>
Date: Wed, 2 Nov 2022 16:59:44 +0000
Subject: [PATCH 013/190] closes #65 with simpler solution
---
archivers/base_archiver.py | 9 +--------
1 file changed, 1 insertion(+), 8 deletions(-)
diff --git a/archivers/base_archiver.py b/archivers/base_archiver.py
index 50fa588..3dc5ba1 100644
--- a/archivers/base_archiver.py
+++ b/archivers/base_archiver.py
@@ -214,6 +214,7 @@ class Archiver(ABC):
browsertrix_home = os.path.join(os.getcwd(), "browsertrix-tmp")
cmd = [
"docker", "run",
+ "--rm", # delete container once it has completed running
"-v", f"{browsertrix_home}:/crawls/",
# "-it", # this leads to "the input device is not a TTY"
"webrecorder/browsertrix-crawler", "crawl",
@@ -240,14 +241,6 @@ class Archiver(ABC):
except Exception as e:
logger.error(f"WACZ generation failed: {e}")
return
- try:
- # TODO: is there a better way to manage the containers, like reusing?
- # https://github.com/bellingcat/auto-archiver/issues/65
- cmd_clean_docker = 'docker rm $(docker ps -a -q --filter ancestor=webrecorder/browsertrix-crawler --format="{{.ID}}")'
- logger.info(f"Cleaning docker containers: {cmd_clean_docker}")
- os.system(cmd_clean_docker)
- except Exception as e:
- logger.error(f"Could not clean dangling docker containers: {e}")
filename = os.path.join(browsertrix_home, "collections", collection, f"{collection}.wacz")
From 629cd586db7e8119f9c6f6d4daed5967d1c71c9c Mon Sep 17 00:00:00 2001
From: msramalho <19508417+msramalho@users.noreply.github.com>
Date: Tue, 8 Nov 2022 13:59:09 +0000
Subject: [PATCH 014/190] adds session_file for missing archivers
---
configs/config.py | 7 +++++--
configs/telethon_config.py | 1 +
configs/vk_config.py | 1 +
example.config.yaml | 4 ++++
4 files changed, 11 insertions(+), 2 deletions(-)
diff --git a/configs/config.py b/configs/config.py
index 372b3d7..01b8173 100644
--- a/configs/config.py
+++ b/configs/config.py
@@ -34,6 +34,7 @@ class Config:
def __init__(self):
self.parser = self.get_argument_parser()
self.folder = ""
+ self.is_docker = bool(os.environ.get("IS_DOCKER", 0))
def parse(self):
self.args = self.parser.parse_args()
@@ -152,7 +153,8 @@ class Config:
self.telegram_config = TelethonConfig(
api_id=secrets["telegram"]["api_id"],
api_hash=secrets["telegram"]["api_hash"],
- bot_token=secrets["telegram"].get("bot_token", None)
+ bot_token=secrets["telegram"].get("bot_token", None),
+ session_file=secrets["telegram"].get("session_file", "./anon")
)
else:
self.telegram_config = None
@@ -175,7 +177,8 @@ class Config:
if "vk" in secrets:
self.vk_config = VkConfig(
username=secrets["vk"]["username"],
- password=secrets["vk"]["password"]
+ password=secrets["vk"]["password"],
+ session_file=secrets["vk"].get("session_file", "./vk_config.v2.json")
)
else:
self.vk_config = None
diff --git a/configs/telethon_config.py b/configs/telethon_config.py
index 3099bb5..111c7bd 100644
--- a/configs/telethon_config.py
+++ b/configs/telethon_config.py
@@ -7,3 +7,4 @@ class TelethonConfig:
api_id: str
api_hash: str
bot_token: str
+ session_file: str
diff --git a/configs/vk_config.py b/configs/vk_config.py
index db2e61c..4c3472c 100644
--- a/configs/vk_config.py
+++ b/configs/vk_config.py
@@ -6,3 +6,4 @@ from dataclasses import dataclass
class VkConfig:
username: str
password: str
+ session_file: str
diff --git a/example.config.yaml b/example.config.yaml
index c6ad8f8..857265b 100644
--- a/example.config.yaml
+++ b/example.config.yaml
@@ -50,6 +50,8 @@ secrets:
api_hash: your API hash
# optional, but allows access to more content such as large videos, talk to @botfather
bot_token: your bot-token
+ # optional, defaults to ./anon, records the telegram login session for future usage
+ session_file: "secrets/anon"
# twitter configuration - API V2 only
# if you don't provide credentials the less-effective unofficial TwitterArchiver will be used instead
@@ -66,6 +68,8 @@ secrets:
vk:
username: "phone number or email"
password: "password"
+ # optional, defaults to ./vk_config.v2.json, records VK login session for future usage
+ session_file: "secrets/vk_config.v2.json"
# instagram credentials
instagram:
From 09f47383a3478ff43ff01aa659316782154167b6 Mon Sep 17 00:00:00 2001
From: msramalho <19508417+msramalho@users.noreply.github.com>
Date: Tue, 8 Nov 2022 13:59:35 +0000
Subject: [PATCH 015/190] dockerfile improvements
---
Dockerfile | 30 ++++++++++++++++++++----------
1 file changed, 20 insertions(+), 10 deletions(-)
diff --git a/Dockerfile b/Dockerfile
index b2de36a..a9b4d7a 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,24 +1,34 @@
+# stage 1 - all dependencies
From python:3.10
WORKDIR /app
-COPY . .
-
-
# TODO: use custom ffmpeg builds instead of apt-get install
RUN pip install --upgrade pip && \
pip install pipenv && \
apt-get update && \
- apt-get install -y gcc ffmpeg fonts-noto firefox-esr
-
-RUN wget https://github.com/mozilla/geckodriver/releases/download/v0.32.0/geckodriver-v0.32.0-linux64.tar.gz && \
+ apt-get install -y gcc ffmpeg fonts-noto firefox-esr && \
+ wget https://github.com/mozilla/geckodriver/releases/download/v0.32.0/geckodriver-v0.32.0-linux64.tar.gz && \
tar -xvzf geckodriver* -C /usr/local/bin && \
chmod +x /usr/local/bin/geckodriver && \
- rm geckodriver-v* && \
- pipenv install --python=3.10
+ rm geckodriver-v*
+
+
+# install docker for WACZ
+RUN curl -fsSL https://get.docker.com | sh
+
+# RUN git clone https://github.com/bellingcat/auto-archiver
+# TODO: avoid copying unnecessary files, including .git
+# COPY ./src/ .
+COPY Pipfile Pipfile.lock ./
+RUN pipenv install --python=3.10 --system --deploy
+# TODO: to avoid copying pipfile lock it should be on the .dockerignore
+ENV IS_DOCKER=1
+COPY . .
# CMD ["pipenv", "run", "python", "auto_archive.py"]
-ENTRYPOINT ["pipenv", "run", "python", "auto_archive.py"]
+ENTRYPOINT ["python", "auto_archive.py"]
+# ENTRYPOINT ["docker-entrypoint.sh"]
# should be executed with 2 volumes
-# docker run -v /var/run/docker.sock:/var/run/docker.sock -v $PWD/secrets:/app/ aa --help
\ No newline at end of file
+# docker run -v /var/run/docker.sock:/var/run/docker.sock -v $PWD/secrets:/app/secrets -v $PWD/local_archive:/app/local_archive aa --help
\ No newline at end of file
From a8f7055696ee14f41a67e5487a6883c3872f274d Mon Sep 17 00:00:00 2001
From: msramalho <19508417+msramalho@users.noreply.github.com>
Date: Tue, 8 Nov 2022 13:59:59 +0000
Subject: [PATCH 016/190] reduces uncontrolled exceptions
---
Pipfile.lock | 332 ++++++++++++++++++++-----------------
archivers/base_archiver.py | 7 +-
2 files changed, 183 insertions(+), 156 deletions(-)
diff --git a/Pipfile.lock b/Pipfile.lock
index e2d1b1b..6aac097 100644
--- a/Pipfile.lock
+++ b/Pipfile.lock
@@ -42,10 +42,10 @@
},
"authlib": {
"hashes": [
- "sha256:b83cf6360c8e92b0e9df0d1f32d675790bcc4e3c03977499b1eed24dcdef4252",
- "sha256:ecf4a7a9f2508c0bb07e93a752dd3c495cfaffc20e864ef0ffc95e3f40d2abaf"
+ "sha256:2988fdf7d0a5c416f5a37ca4b1e7cee360094940229bc97909aed25880326c72",
+ "sha256:6de4508ba8125e438a35bcd910d55df7087dccd3dd8517095c2bd9853c372ec1"
],
- "version": "==0.15.5"
+ "version": "==0.15.6"
},
"beautifulsoup4": {
"hashes": [
@@ -57,19 +57,19 @@
},
"boto3": {
"hashes": [
- "sha256:3c6cc4e9e38cf4523267f89eb90c0b6084fa415cb4f44e3bf0cad6199340cc92",
- "sha256:d28bcb98aee4d333b163c55b98341627d933dbf088832f7fc050893617be7dac"
+ "sha256:3b0fa19390895e664045713f2e47e63ad29c9f98b7bee6836dec7124953e48b8",
+ "sha256:9feb98e045736f943c2099d955415cfe44133e03d8e2d7581d2e5dc74d0ed064"
],
"index": "pypi",
- "version": "==1.24.92"
+ "version": "==1.26.1"
},
"botocore": {
"hashes": [
- "sha256:70cf2cb04968794ed4688cc3b07874f6f4c932e325611be4e693a995fdb481be",
- "sha256:b49c34b80c782625905be75e669da4b42a99f074e0aa3007e15ccc6955682a07"
+ "sha256:75c65130ffab527d0a3d948c6d87eb8eac210e079e1ff2768c66484be57bb77c",
+ "sha256:e38b7cdce927cefabe45608dde61660b76458fba6624240dcdb6c4b8453d17f7"
],
"markers": "python_version >= '3.7'",
- "version": "==1.27.92"
+ "version": "==1.29.1"
},
"brotli": {
"hashes": [
@@ -159,7 +159,7 @@
"sha256:84c85a9078b11105f04f3036a9482ae10e4621616db313fe045dd24743a0820d",
"sha256:fe86415d55e84719d75f8b69414f6438ac3547d2078ab91b67e779ef69378412"
],
- "markers": "python_full_version >= '3.6.0'",
+ "markers": "python_version >= '3.6'",
"version": "==2022.6.15"
},
"cffi": {
@@ -263,59 +263,59 @@
},
"cryptography": {
"hashes": [
- "sha256:0297ffc478bdd237f5ca3a7dc96fc0d315670bfa099c04dc3a4a2172008a405a",
- "sha256:10d1f29d6292fc95acb597bacefd5b9e812099d75a6469004fd38ba5471a977f",
- "sha256:16fa61e7481f4b77ef53991075de29fc5bacb582a1244046d2e8b4bb72ef66d0",
- "sha256:194044c6b89a2f9f169df475cc167f6157eb9151cc69af8a2a163481d45cc407",
- "sha256:1db3d807a14931fa317f96435695d9ec386be7b84b618cc61cfa5d08b0ae33d7",
- "sha256:3261725c0ef84e7592597606f6583385fed2a5ec3909f43bc475ade9729a41d6",
- "sha256:3b72c360427889b40f36dc214630e688c2fe03e16c162ef0aa41da7ab1455153",
- "sha256:3e3a2599e640927089f932295a9a247fc40a5bdf69b0484532f530471a382750",
- "sha256:3fc26e22840b77326a764ceb5f02ca2d342305fba08f002a8c1f139540cdfaad",
- "sha256:5067ee7f2bce36b11d0e334abcd1ccf8c541fc0bbdaf57cdd511fdee53e879b6",
- "sha256:52e7bee800ec869b4031093875279f1ff2ed12c1e2f74923e8f49c916afd1d3b",
- "sha256:64760ba5331e3f1794d0bcaabc0d0c39e8c60bf67d09c93dc0e54189dfd7cfe5",
- "sha256:765fa194a0f3372d83005ab83ab35d7c5526c4e22951e46059b8ac678b44fa5a",
- "sha256:79473cf8a5cbc471979bd9378c9f425384980fcf2ab6534b18ed7d0d9843987d",
- "sha256:896dd3a66959d3a5ddcfc140a53391f69ff1e8f25d93f0e2e7830c6de90ceb9d",
- "sha256:89ed49784ba88c221756ff4d4755dbc03b3c8d2c5103f6d6b4f83a0fb1e85294",
- "sha256:ac7e48f7e7261207d750fa7e55eac2d45f720027d5703cd9007e9b37bbb59ac0",
- "sha256:ad7353f6ddf285aeadfaf79e5a6829110106ff8189391704c1d8801aa0bae45a",
- "sha256:b0163a849b6f315bf52815e238bc2b2346604413fa7c1601eea84bcddb5fb9ac",
- "sha256:b6c9b706316d7b5a137c35e14f4103e2115b088c412140fdbd5f87c73284df61",
- "sha256:c2e5856248a416767322c8668ef1845ad46ee62629266f84a8f007a317141013",
- "sha256:ca9f6784ea96b55ff41708b92c3f6aeaebde4c560308e5fbbd3173fbc466e94e",
- "sha256:d1a5bd52d684e49a36582193e0b89ff267704cd4025abefb9e26803adeb3e5fb",
- "sha256:d3971e2749a723e9084dd507584e2a2761f78ad2c638aa31e80bc7a15c9db4f9",
- "sha256:d4ef6cc305394ed669d4d9eebf10d3a101059bdcf2669c366ec1d14e4fb227bd",
- "sha256:d9e69ae01f99abe6ad646947bba8941e896cb3aa805be2597a0400e0764b5818"
+ "sha256:068147f32fa662c81aebab95c74679b401b12b57494872886eb5c1139250ec5d",
+ "sha256:06fc3cc7b6f6cca87bd56ec80a580c88f1da5306f505876a71c8cfa7050257dd",
+ "sha256:25c1d1f19729fb09d42e06b4bf9895212292cb27bb50229f5aa64d039ab29146",
+ "sha256:402852a0aea73833d982cabb6d0c3bb582c15483d29fb7085ef2c42bfa7e38d7",
+ "sha256:4e269dcd9b102c5a3d72be3c45d8ce20377b8076a43cbed6f660a1afe365e436",
+ "sha256:5419a127426084933076132d317911e3c6eb77568a1ce23c3ac1e12d111e61e0",
+ "sha256:554bec92ee7d1e9d10ded2f7e92a5d70c1f74ba9524947c0ba0c850c7b011828",
+ "sha256:5e89468fbd2fcd733b5899333bc54d0d06c80e04cd23d8c6f3e0542358c6060b",
+ "sha256:65535bc550b70bd6271984d9863a37741352b4aad6fb1b3344a54e6950249b55",
+ "sha256:6ab9516b85bebe7aa83f309bacc5f44a61eeb90d0b4ec125d2d003ce41932d36",
+ "sha256:6addc3b6d593cd980989261dc1cce38263c76954d758c3c94de51f1e010c9a50",
+ "sha256:728f2694fa743a996d7784a6194da430f197d5c58e2f4e278612b359f455e4a2",
+ "sha256:785e4056b5a8b28f05a533fab69febf5004458e20dad7e2e13a3120d8ecec75a",
+ "sha256:78cf5eefac2b52c10398a42765bfa981ce2372cbc0457e6bf9658f41ec3c41d8",
+ "sha256:7f836217000342d448e1c9a342e9163149e45d5b5eca76a30e84503a5a96cab0",
+ "sha256:8d41a46251bf0634e21fac50ffd643216ccecfaf3701a063257fe0b2be1b6548",
+ "sha256:984fe150f350a3c91e84de405fe49e688aa6092b3525f407a18b9646f6612320",
+ "sha256:9b24bcff7853ed18a63cfb0c2b008936a9554af24af2fb146e16d8e1aed75748",
+ "sha256:b1b35d9d3a65542ed2e9d90115dfd16bbc027b3f07ee3304fc83580f26e43249",
+ "sha256:b1b52c9e5f8aa2b802d48bd693190341fae201ea51c7a167d69fc48b60e8a959",
+ "sha256:bbf203f1a814007ce24bd4d51362991d5cb90ba0c177a9c08825f2cc304d871f",
+ "sha256:be243c7e2bfcf6cc4cb350c0d5cdf15ca6383bbcb2a8ef51d3c9411a9d4386f0",
+ "sha256:bfbe6ee19615b07a98b1d2287d6a6073f734735b49ee45b11324d85efc4d5cbd",
+ "sha256:c46837ea467ed1efea562bbeb543994c2d1f6e800785bd5a2c98bc096f5cb220",
+ "sha256:dfb4f4dd568de1b6af9f4cda334adf7d72cf5bc052516e1b2608b683375dd95c",
+ "sha256:ed7b00096790213e09eb11c97cc6e2b757f15f3d2f85833cd2d3ec3fe37c1722"
],
- "markers": "python_full_version >= '3.6.0'",
- "version": "==38.0.1"
+ "markers": "python_version >= '3.6'",
+ "version": "==38.0.3"
},
"dataclasses-json": {
"hashes": [
"sha256:bc285b5f892094c3a53d558858a88553dd6a61a11ab1a8128a0e554385dcc5dd",
"sha256:c2c11bc8214fbf709ffc369d11446ff6945254a7f09128154a7620613d8fda90"
],
- "markers": "python_full_version >= '3.6.0'",
+ "markers": "python_version >= '3.6'",
"version": "==0.5.7"
},
"dateparser": {
"hashes": [
- "sha256:038196b1f12c7397e38aad3d61588833257f6f552baa63a1499e6987fa8d42d9",
- "sha256:9600874312ff28a41f96ec7ccdc73be1d1c44435719da47fea3339d55ff5a628"
+ "sha256:711f7eef6d431225bec56c00e386af3f6a47083276253375bdae1ae6c8d23d4a",
+ "sha256:ae7a7de30f26983d09fff802c1f9d35d54e1c11d7ab52ae904a1f3fc037ecba5"
],
"index": "pypi",
- "version": "==1.1.1"
+ "version": "==1.1.3"
},
"exceptiongroup": {
"hashes": [
- "sha256:2e3c3fc1538a094aab74fad52d6c33fc94de3dfee3ee01f187c0e0c72aec5337",
- "sha256:9086a4a21ef9b31c72181c77c040a074ba0889ee56a7b289ff0afb0d97655f96"
+ "sha256:2ac84b496be68464a2da60da518af3785fff8b7ec0d090a581604bc870bdee41",
+ "sha256:affbabf13fb6e98988c38d9c5650e701569fe3c1de3233cfb61c5f33774690ad"
],
"markers": "python_version < '3.11'",
- "version": "==1.0.0rc9"
+ "version": "==1.0.0"
},
"ffmpeg-python": {
"hashes": [
@@ -358,19 +358,19 @@
},
"google-api-python-client": {
"hashes": [
- "sha256:0dc4c967a5c795e981af01340f1bd22173a986534de968b5456cb208ed6775a6",
- "sha256:90545cd71969f8bcf15a6362c2a8c44c38b94ec35a88cfd60cf2c0df68a5eb74"
+ "sha256:2c6611530308b3f931dcf1360713aa3a20cf465d0bf2bac65f2ec99e8c9860de",
+ "sha256:b8a0ca8454ad57bc65199044717d3d214197ae1e2d666426bbcd4021b36762e0"
],
"index": "pypi",
- "version": "==2.64.0"
+ "version": "==2.65.0"
},
"google-auth": {
"hashes": [
- "sha256:9352dd6394093169157e6971526bab9a2799244d68a94a4a609f0dd751ef6f5e",
- "sha256:99510e664155f1a3c0396a076b5deb6367c52ea04d280152c85ac7f51f50eb42"
+ "sha256:1ad5b0e6eba5f69645971abb3d2c197537d5914070a8c6d30299dfdb07c5c700",
+ "sha256:cf24817855d874ede2efd071aa22125445f555de1685b739a9782fcf408c2a3d"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5'",
- "version": "==2.13.0"
+ "version": "==2.14.0"
},
"google-auth-httplib2": {
"hashes": [
@@ -382,11 +382,11 @@
},
"google-auth-oauthlib": {
"hashes": [
- "sha256:307d21918d61a0741882ad1fd001c67e68ad81206451d05fc4d26f79de56fc90",
- "sha256:9e8ff4ed2b21c174a2d6cc2172c698dbf0b1f686509774c663a83c495091fe09"
+ "sha256:53019edbde83e08ff0740eefc5bded7e26a289941d12e7ae1f0f5bacf2faa031",
+ "sha256:db11bce4b3effc99b518ec22a2903470e0853c0c92be57694e3684e738d22513"
],
"index": "pypi",
- "version": "==0.5.3"
+ "version": "==0.7.0"
},
"googleapis-common-protos": {
"hashes": [
@@ -398,11 +398,11 @@
},
"gspread": {
"hashes": [
- "sha256:0fe52bec73cc232abadfbc2a999e30201bc5cb0c2728ec00fcfdf38f6f669375",
- "sha256:9fca855173fdb2e648b3da9e7bbffb83601bfd7c7131d44fa781df84c689e7fc"
+ "sha256:41f7a416425f1ec5a1b677f49b8fbf599102766c27ed7be6601a58c9a1550ebc",
+ "sha256:d3bbff4b7aad0fc2c986458e148537a02fe7b46e7162f41f3a42392bfa2adb89"
],
"index": "pypi",
- "version": "==5.6.0"
+ "version": "==5.6.2"
},
"h11": {
"hashes": [
@@ -414,11 +414,11 @@
},
"httplib2": {
"hashes": [
- "sha256:58a98e45b4b1a48273073f905d2961666ecf0fbac4250ea5b47aef259eb5c585",
- "sha256:8b6a905cb1c79eefd03f8669fd993c36dc341f7c558f056cb5a33b5c2f458543"
+ "sha256:987c8bb3eb82d3fa60c68699510a692aa2ad9c4bd4f123e51dfb1488c14cdd01",
+ "sha256:fc144f091c7286b82bec71bdbd9b27323ba709cc612568d3000893bfd9cb4b34"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
- "version": "==0.20.4"
+ "version": "==0.21.0"
},
"idna": {
"hashes": [
@@ -632,7 +632,7 @@
"sha256:8139f29aac13e25d502680e9e19963e83f16838d48a0d71c287fe40e7067fbca",
"sha256:9859c40929662bec5d64f34d01c99e093149682a3f38915dc0655d5a633dd918"
],
- "markers": "python_full_version >= '3.6.0'",
+ "markers": "python_version >= '3.6'",
"version": "==3.2.2"
},
"outcome": {
@@ -648,28 +648,28 @@
"sha256:dd47c42927d89ab911e606518907cc2d3a1f38bbd026385970643f9c5b8ecfeb",
"sha256:ef103e05f519cdc783ae24ea4e2e0f508a9c99b2d4969652eed6a2e1ea5bd522"
],
- "markers": "python_full_version >= '3.6.0'",
+ "markers": "python_version >= '3.6'",
"version": "==21.3"
},
"protobuf": {
"hashes": [
- "sha256:3ec85328a35a16463c6f419dbce3c0fc42b3e904d966f17f48bae39597c7a543",
- "sha256:58b81358ec6c0b5d50df761460ae2db58405c063fd415e1101209221a0a810e1",
- "sha256:71d9dba03ed3432c878a801e2ea51e034b0ea01cf3a4344fb60166cb5f6c8757",
- "sha256:8066322588d4b499869bf9f665ebe448e793036b552f68c585a9b28f1e393f66",
- "sha256:8e09d1916386eca1ef1353767b6efcebc0a6859ed7f73cb7fb974feba3184830",
- "sha256:9643684232b6b340b5e63bb69c9b4904cdd39e4303d498d1a92abddc7e895b7f",
- "sha256:9e355f2a839d9930d83971b9f562395e13493f0e9211520f8913bd11efa53c02",
- "sha256:a74d96cd960b87b4b712797c741bb3ea3a913f5c2dc4b6cbe9c0f8360b75297d",
- "sha256:b019c79e23a80735cc8a71b95f76a49a262f579d6b84fd20a0b82279f40e2cc1",
- "sha256:c7cb105d69a87416bd9023e64324e1c089593e6dae64d2536f06bcbe49cd97d8",
- "sha256:ca200645d6235ce0df3ccfdff1567acbab35c4db222a97357806e015f85b5744",
- "sha256:d3f89ccf7182293feba2de2739c8bf34fed1ed7c65a5cf987be00311acac57c1",
- "sha256:db9056b6a11cb5131036d734bcbf91ef3ef9235d6b681b2fc431cbfe5a7f2e56",
- "sha256:f370c0a71712f8965023dd5b13277444d3cdfecc96b2c778b0e19acbfd60df6e"
+ "sha256:2c9c2ed7466ad565f18668aa4731c535511c5d9a40c6da39524bccf43e441719",
+ "sha256:48e2cd6b88c6ed3d5877a3ea40df79d08374088e89bedc32557348848dff250b",
+ "sha256:5b0834e61fb38f34ba8840d7dcb2e5a2f03de0c714e0293b3963b79db26de8ce",
+ "sha256:61f21493d96d2a77f9ca84fefa105872550ab5ef71d21c458eb80edcf4885a99",
+ "sha256:6e0be9f09bf9b6cf497b27425487706fa48c6d1632ddd94dab1a5fe11a422392",
+ "sha256:6e312e280fbe3c74ea9e080d9e6080b636798b5e3939242298b591064470b06b",
+ "sha256:7eb8f2cc41a34e9c956c256e3ac766cf4e1a4c9c925dc757a41a01be3e852965",
+ "sha256:84ea107016244dfc1eecae7684f7ce13c788b9a644cd3fca5b77871366556444",
+ "sha256:9227c14010acd9ae7702d6467b4625b6fe853175a6b150e539b21d2b2f2b409c",
+ "sha256:a419cc95fca8694804709b8c4f2326266d29659b126a93befe210f5bbc772536",
+ "sha256:a7d0ea43949d45b836234f4ebb5ba0b22e7432d065394b532cdca8f98415e3cf",
+ "sha256:b5ab0b8918c136345ff045d4b3d5f719b505b7c8af45092d7f45e304f55e50a1",
+ "sha256:e575c57dc8b5b2b2caa436c16d44ef6981f2235eb7179bfc847557886376d740",
+ "sha256:f9eae277dd240ae19bb06ff4e2346e771252b0e619421965504bd1b1bba7c5fa"
],
"markers": "python_version >= '3.7'",
- "version": "==4.21.7"
+ "version": "==4.21.9"
},
"pyaes": {
"hashes": [
@@ -761,7 +761,7 @@
"sha256:56a8508ae95f98e2b9bdf93a6be5ae3f7d8af858b43e02c5a2ff083726be40c1",
"sha256:f643f331ab57ba3c9d89212ee4a2dabc6e94f117cf4eefde99a0574720d14c42"
],
- "markers": "python_full_version >= '3.6.0'",
+ "markers": "python_version >= '3.6'",
"version": "==2.13.0"
},
"pyparsing": {
@@ -814,9 +814,10 @@
},
"pytz": {
"hashes": [
- "sha256:335ab46900b1465e714b4fda4963d87363264eb662aab5e65da039c25f1f5b22"
+ "sha256:222439474e9c98fced559f1709d89e6c9cbf8d79c794ff3eb9f8800064291427",
+ "sha256:e89512406b793ca39f5971bc999cc538ce125c0e51c27941bef4568b460095e2"
],
- "version": "==2022.5"
+ "version": "==2022.6"
},
"pytz-deprecation-shim": {
"hashes": [
@@ -949,7 +950,7 @@
"sha256:fbc88d3ba402b5d041d204ec2449c4078898f89c4a6e6f0ed1c1a510ef1e221d",
"sha256:fbd3fe37353c62fd0eb19fb76f78aa693716262bcd5f9c14bb9e5aca4b3f0dc4"
],
- "markers": "python_full_version >= '3.6.0'",
+ "markers": "python_version >= '3.6'",
"version": "==2022.3.2"
},
"requests": {
@@ -970,11 +971,11 @@
},
"requests-toolbelt": {
"hashes": [
- "sha256:64c6b8c51b515d123f9f708a29743f44eb70c4479440641ed2df8c4dea56d985",
- "sha256:f695d6207931200b46c8ef6addbc8a921fb5d77cc4cd209c2e7d39293fcd2b30"
+ "sha256:18565aa58116d9951ac39baa288d3adb5b3ff975c4f25eee78555d89e8f247f7",
+ "sha256:62e09f7ff5ccbda92772a29f394a49c3ad6cb181d568b1337626b2abb628a63d"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
- "version": "==0.10.0"
+ "version": "==0.10.1"
},
"rich": {
"hashes": [
@@ -989,7 +990,7 @@
"sha256:90260d9058e514786967344d0ef75fa8727eed8a7d2e43ce9f4bcf1b536174f7",
"sha256:e38464a49c6c85d7f1351b0126661487a7e0a14a50f1675ec50eb34d4f20ef21"
],
- "markers": "python_version < '4' and python_full_version >= '3.6.0'",
+ "markers": "python_version >= '3.6' and python_version < '4'",
"version": "==4.9"
},
"s3transfer": {
@@ -1043,7 +1044,7 @@
"sha256:3b2503d3c7084a42b1ebd08116e5f81aadfaea95863628c80a3b774a11b7c759",
"sha256:fc53893b3da2c33de295667a0e19f078c14bf86544af307354de5fcf12a3f30d"
],
- "markers": "python_full_version >= '3.6.0'",
+ "markers": "python_version >= '3.6'",
"version": "==2.3.2.post1"
},
"telethon": {
@@ -1109,18 +1110,18 @@
},
"tzdata": {
"hashes": [
- "sha256:323161b22b7802fdc78f20ca5f6073639c64f1a7227c40cd3e19fd1d0ce6650a",
- "sha256:e15b2b3005e2546108af42a0eb4ccab4d9e225e2dfbf4f77aad50c70a4b1f3ab"
+ "sha256:04a680bdc5b15750c39c12a448885a51134a27ec9af83667663f0b3a1bf3f342",
+ "sha256:91f11db4503385928c15598c98573e3af07e7229181bee5375bd30f1695ddcae"
],
- "markers": "python_full_version >= '3.6.0'",
- "version": "==2022.5"
+ "markers": "python_version >= '3.6'",
+ "version": "==2022.6"
},
"tzlocal": {
"hashes": [
"sha256:89885494684c929d9191c57aa27502afc87a579be5cdd3225c77c463ea043745",
"sha256:ee5842fa3a795f023514ac2d801c4a81d1743bbe642e3940143326b3a00addd7"
],
- "markers": "python_full_version >= '3.6.0'",
+ "markers": "python_version >= '3.6'",
"version": "==4.2"
},
"uritemplate": {
@@ -1128,7 +1129,7 @@
"sha256:4346edfc5c3b79f694bccd6d6099a322bbeb628dbf2cd86eea55a456ce5124f0",
"sha256:830c08b8d99bdd312ea4ead05994a38e8936266f84b9a7878232db50b044e02e"
],
- "markers": "python_full_version >= '3.6.0'",
+ "markers": "python_version >= '3.6'",
"version": "==4.1.1"
},
"urllib3": {
@@ -1148,65 +1149,86 @@
},
"vk-url-scraper": {
"hashes": [
- "sha256:7caf8d788fc268d311b13c06ff0cbd9413dd8978f463af970459b9e7e2f42ba5",
- "sha256:c4593d86b5096e75e2845e4838f46ce2cf0ac34b2fe1c4476d2eeb6744b18a11"
+ "sha256:3718a569e431c9c2bc7e92e9156e25b7112dc0b9b461c8001fa481a00ccbd3bc",
+ "sha256:baebe32bb29d6f188d849f38ecc43d04d5b5bad05db7f31dfdbe450f684042f0"
],
"index": "pypi",
- "version": "==0.3.5"
+ "version": "==0.3.8"
},
"websockets": {
"hashes": [
- "sha256:07cdc0a5b2549bcfbadb585ad8471ebdc7bdf91e32e34ae3889001c1c106a6af",
- "sha256:210aad7fdd381c52e58777560860c7e6110b6174488ef1d4b681c08b68bf7f8c",
- "sha256:28dd20b938a57c3124028680dc1600c197294da5db4292c76a0b48efb3ed7f76",
- "sha256:2f94fa3ae454a63ea3a19f73b95deeebc9f02ba2d5617ca16f0bbdae375cda47",
- "sha256:31564a67c3e4005f27815634343df688b25705cccb22bc1db621c781ddc64c69",
- "sha256:347974105bbd4ea068106ec65e8e8ebd86f28c19e529d115d89bd8cc5cda3079",
- "sha256:379e03422178436af4f3abe0aa8f401aa77ae2487843738542a75faf44a31f0c",
- "sha256:3eda1cb7e9da1b22588cefff09f0951771d6ee9fa8dbe66f5ae04cc5f26b2b55",
- "sha256:51695d3b199cd03098ae5b42833006a0f43dc5418d3102972addc593a783bc02",
- "sha256:54c000abeaff6d8771a4e2cef40900919908ea7b6b6a30eae72752607c6db559",
- "sha256:5b936bf552e4f6357f5727579072ff1e1324717902127ffe60c92d29b67b7be3",
- "sha256:6075fd24df23133c1b078e08a9b04a3bc40b31a8def4ee0b9f2c8865acce913e",
- "sha256:661f641b44ed315556a2fa630239adfd77bd1b11cb0b9d96ed8ad90b0b1e4978",
- "sha256:6ea6b300a6bdd782e49922d690e11c3669828fe36fc2471408c58b93b5535a98",
- "sha256:6ed1d6f791eabfd9808afea1e068f5e59418e55721db8b7f3bfc39dc831c42ae",
- "sha256:7934e055fd5cd9dee60f11d16c8d79c4567315824bacb1246d0208a47eca9755",
- "sha256:7ab36e17af592eec5747c68ef2722a74c1a4a70f3772bc661079baf4ae30e40d",
- "sha256:7f6d96fdb0975044fdd7953b35d003b03f9e2bcf85f2d2cf86285ece53e9f991",
- "sha256:83e5ca0d5b743cde3d29fda74ccab37bdd0911f25bd4cdf09ff8b51b7b4f2fa1",
- "sha256:85506b3328a9e083cc0a0fb3ba27e33c8db78341b3eb12eb72e8afd166c36680",
- "sha256:8af75085b4bc0b5c40c4a3c0e113fa95e84c60f4ed6786cbb675aeb1ee128247",
- "sha256:8b1359aba0ff810d5830d5ab8e2c4a02bebf98a60aa0124fb29aa78cfdb8031f",
- "sha256:8fbd7d77f8aba46d43245e86dd91a8970eac4fb74c473f8e30e9c07581f852b2",
- "sha256:907e8247480f287aa9bbc9391bd6de23c906d48af54c8c421df84655eef66af7",
- "sha256:93d5ea0b5da8d66d868b32c614d2b52d14304444e39e13a59566d4acb8d6e2e4",
- "sha256:97bc9d41e69a7521a358f9b8e44871f6cdeb42af31815c17aed36372d4eec667",
- "sha256:994cdb1942a7a4c2e10098d9162948c9e7b235df755de91ca33f6e0481366fdb",
- "sha256:a141de3d5a92188234afa61653ed0bbd2dde46ad47b15c3042ffb89548e77094",
- "sha256:a1e15b230c3613e8ea82c9fc6941b2093e8eb939dd794c02754d33980ba81e36",
- "sha256:aad5e300ab32036eb3fdc350ad30877210e2f51bceaca83fb7fef4d2b6c72b79",
- "sha256:b529fdfa881b69fe563dbd98acce84f3e5a67df13de415e143ef053ff006d500",
- "sha256:b9c77f0d1436ea4b4dc089ed8335fa141e6a251a92f75f675056dac4ab47a71e",
- "sha256:bb621ec2dbbbe8df78a27dbd9dd7919f9b7d32a73fafcb4d9252fc4637343582",
- "sha256:c7250848ce69559756ad0086a37b82c986cd33c2d344ab87fea596c5ac6d9442",
- "sha256:c8d1d14aa0f600b5be363077b621b1b4d1eb3fbf90af83f9281cda668e6ff7fd",
- "sha256:d1655a6fc7aecd333b079d00fb3c8132d18988e47f19740c69303bf02e9883c6",
- "sha256:d6353ba89cfc657a3f5beabb3b69be226adbb5c6c7a66398e17809b0ce3c4731",
- "sha256:da4377904a3379f0c1b75a965fff23b28315bcd516d27f99a803720dfebd94d4",
- "sha256:e49ea4c1a9543d2bd8a747ff24411509c29e4bdcde05b5b0895e2120cb1a761d",
- "sha256:e4e08305bfd76ba8edab08dcc6496f40674f44eb9d5e23153efa0a35750337e8",
- "sha256:e6fa05a680e35d0fcc1470cb070b10e6fe247af54768f488ed93542e71339d6f",
- "sha256:e7e6f2d6fd48422071cc8a6f8542016f350b79cc782752de531577d35e9bd677",
- "sha256:e904c0381c014b914136c492c8fa711ca4cced4e9b3d110e5e7d436d0fc289e8",
- "sha256:ec2b0ab7edc8cd4b0eb428b38ed89079bdc20c6bdb5f889d353011038caac2f9",
- "sha256:ef5ce841e102278c1c2e98f043db99d6755b1c58bde475516aef3a008ed7f28e",
- "sha256:f351c7d7d92f67c0609329ab2735eee0426a03022771b00102816a72715bb00b",
- "sha256:fab7c640815812ed5f10fbee7abbf58788d602046b7bb3af9b1ac753a6d5e916",
- "sha256:fc06cc8073c8e87072138ba1e431300e2d408f054b27047d047b549455066ff4"
+ "sha256:00213676a2e46b6ebf6045bc11d0f529d9120baa6f58d122b4021ad92adabd41",
+ "sha256:00c870522cdb69cd625b93f002961ffb0c095394f06ba8c48f17eef7c1541f96",
+ "sha256:0154f7691e4fe6c2b2bc275b5701e8b158dae92a1ab229e2b940efe11905dff4",
+ "sha256:05a7233089f8bd355e8cbe127c2e8ca0b4ea55467861906b80d2ebc7db4d6b72",
+ "sha256:09a1814bb15eff7069e51fed0826df0bc0702652b5cb8f87697d469d79c23576",
+ "sha256:0cff816f51fb33c26d6e2b16b5c7d48eaa31dae5488ace6aae468b361f422b63",
+ "sha256:185929b4808b36a79c65b7865783b87b6841e852ef5407a2fb0c03381092fa3b",
+ "sha256:2fc8709c00704194213d45e455adc106ff9e87658297f72d544220e32029cd3d",
+ "sha256:33d69ca7612f0ddff3316b0c7b33ca180d464ecac2d115805c044bf0a3b0d032",
+ "sha256:389f8dbb5c489e305fb113ca1b6bdcdaa130923f77485db5b189de343a179393",
+ "sha256:38ea7b82bfcae927eeffc55d2ffa31665dc7fec7b8dc654506b8e5a518eb4d50",
+ "sha256:3d3cac3e32b2c8414f4f87c1b2ab686fa6284a980ba283617404377cd448f631",
+ "sha256:40e826de3085721dabc7cf9bfd41682dadc02286d8cf149b3ad05bff89311e4f",
+ "sha256:4239b6027e3d66a89446908ff3027d2737afc1a375f8fd3eea630a4842ec9a0c",
+ "sha256:45ec8e75b7dbc9539cbfafa570742fe4f676eb8b0d3694b67dabe2f2ceed8aa6",
+ "sha256:47a2964021f2110116cc1125b3e6d87ab5ad16dea161949e7244ec583b905bb4",
+ "sha256:48c08473563323f9c9debac781ecf66f94ad5a3680a38fe84dee5388cf5acaf6",
+ "sha256:4c6d2264f485f0b53adf22697ac11e261ce84805c232ed5dbe6b1bcb84b00ff0",
+ "sha256:4f72e5cd0f18f262f5da20efa9e241699e0cf3a766317a17392550c9ad7b37d8",
+ "sha256:56029457f219ade1f2fc12a6504ea61e14ee227a815531f9738e41203a429112",
+ "sha256:5c1289596042fad2cdceb05e1ebf7aadf9995c928e0da2b7a4e99494953b1b94",
+ "sha256:62e627f6b6d4aed919a2052efc408da7a545c606268d5ab5bfab4432734b82b4",
+ "sha256:74de2b894b47f1d21cbd0b37a5e2b2392ad95d17ae983e64727e18eb281fe7cb",
+ "sha256:7c584f366f46ba667cfa66020344886cf47088e79c9b9d39c84ce9ea98aaa331",
+ "sha256:7d27a7e34c313b3a7f91adcd05134315002aaf8540d7b4f90336beafaea6217c",
+ "sha256:7d3f0b61c45c3fa9a349cf484962c559a8a1d80dae6977276df8fd1fa5e3cb8c",
+ "sha256:82ff5e1cae4e855147fd57a2863376ed7454134c2bf49ec604dfe71e446e2193",
+ "sha256:84bc2a7d075f32f6ed98652db3a680a17a4edb21ca7f80fe42e38753a58ee02b",
+ "sha256:884be66c76a444c59f801ac13f40c76f176f1bfa815ef5b8ed44321e74f1600b",
+ "sha256:8a5cc00546e0a701da4639aa0bbcb0ae2bb678c87f46da01ac2d789e1f2d2038",
+ "sha256:8dc96f64ae43dde92530775e9cb169979f414dcf5cff670455d81a6823b42089",
+ "sha256:8f38706e0b15d3c20ef6259fd4bc1700cd133b06c3c1bb108ffe3f8947be15fa",
+ "sha256:90fcf8929836d4a0e964d799a58823547df5a5e9afa83081761630553be731f9",
+ "sha256:931c039af54fc195fe6ad536fde4b0de04da9d5916e78e55405436348cfb0e56",
+ "sha256:932af322458da7e4e35df32f050389e13d3d96b09d274b22a7aa1808f292fee4",
+ "sha256:942de28af58f352a6f588bc72490ae0f4ccd6dfc2bd3de5945b882a078e4e179",
+ "sha256:9bc42e8402dc5e9905fb8b9649f57efcb2056693b7e88faa8fb029256ba9c68c",
+ "sha256:a7a240d7a74bf8d5cb3bfe6be7f21697a28ec4b1a437607bae08ac7acf5b4882",
+ "sha256:a9f9a735deaf9a0cadc2d8c50d1a5bcdbae8b6e539c6e08237bc4082d7c13f28",
+ "sha256:ae5e95cfb53ab1da62185e23b3130e11d64431179debac6dc3c6acf08760e9b1",
+ "sha256:b029fb2032ae4724d8ae8d4f6b363f2cc39e4c7b12454df8df7f0f563ed3e61a",
+ "sha256:b0d15c968ea7a65211e084f523151dbf8ae44634de03c801b8bd070b74e85033",
+ "sha256:b343f521b047493dc4022dd338fc6db9d9282658862756b4f6fd0e996c1380e1",
+ "sha256:b627c266f295de9dea86bd1112ed3d5fafb69a348af30a2422e16590a8ecba13",
+ "sha256:b9968694c5f467bf67ef97ae7ad4d56d14be2751000c1207d31bf3bb8860bae8",
+ "sha256:ba089c499e1f4155d2a3c2a05d2878a3428cf321c848f2b5a45ce55f0d7d310c",
+ "sha256:bbccd847aa0c3a69b5f691a84d2341a4f8a629c6922558f2a70611305f902d74",
+ "sha256:bc0b82d728fe21a0d03e65f81980abbbcb13b5387f733a1a870672c5be26edab",
+ "sha256:c57e4c1349fbe0e446c9fa7b19ed2f8a4417233b6984277cce392819123142d3",
+ "sha256:c94ae4faf2d09f7c81847c63843f84fe47bf6253c9d60b20f25edfd30fb12588",
+ "sha256:c9b27d6c1c6cd53dc93614967e9ce00ae7f864a2d9f99fe5ed86706e1ecbf485",
+ "sha256:d210abe51b5da0ffdbf7b43eed0cfdff8a55a1ab17abbec4301c9ff077dd0342",
+ "sha256:d58804e996d7d2307173d56c297cf7bc132c52df27a3efaac5e8d43e36c21c48",
+ "sha256:d6a4162139374a49eb18ef5b2f4da1dd95c994588f5033d64e0bbfda4b6b6fcf",
+ "sha256:da39dd03d130162deb63da51f6e66ed73032ae62e74aaccc4236e30edccddbb0",
+ "sha256:db3c336f9eda2532ec0fd8ea49fef7a8df8f6c804cdf4f39e5c5c0d4a4ad9a7a",
+ "sha256:dd500e0a5e11969cdd3320935ca2ff1e936f2358f9c2e61f100a1660933320ea",
+ "sha256:dd9becd5fe29773d140d68d607d66a38f60e31b86df75332703757ee645b6faf",
+ "sha256:e0cb5cc6ece6ffa75baccfd5c02cffe776f3f5c8bf486811f9d3ea3453676ce8",
+ "sha256:e23173580d740bf8822fd0379e4bf30aa1d5a92a4f252d34e893070c081050df",
+ "sha256:e3a686ecb4aa0d64ae60c9c9f1a7d5d46cab9bfb5d91a2d303d00e2cd4c4c5cc",
+ "sha256:e789376b52c295c4946403bd0efecf27ab98f05319df4583d3c48e43c7342c2f",
+ "sha256:edc344de4dac1d89300a053ac973299e82d3db56330f3494905643bb68801269",
+ "sha256:eef610b23933c54d5d921c92578ae5f89813438fded840c2e9809d378dc765d3",
+ "sha256:f2c38d588887a609191d30e902df2a32711f708abfd85d318ca9b367258cfd0c",
+ "sha256:f55b5905705725af31ccef50e55391621532cd64fbf0bc6f4bac935f0fccec46",
+ "sha256:f5fc088b7a32f244c519a048c170f14cf2251b849ef0e20cbbb0fdf0fdaf556f",
+ "sha256:fe10ddc59b304cb19a1bdf5bd0a7719cbbc9fbdd57ac80ed436b709fcf889106",
+ "sha256:ff64a1d38d156d429404aaa84b27305e957fd10c30e5880d1765c9480bea490f"
],
"markers": "python_version >= '3.7'",
- "version": "==10.3"
+ "version": "==10.4"
},
"werkzeug": {
"hashes": [
@@ -1226,21 +1248,21 @@
},
"yt-dlp": {
"hashes": [
- "sha256:3a7b59d2fb4b39ce8ba8e0b9c5a37fe20e5624f46a2346b4ae66ab1320e35134",
- "sha256:deec1009442312c1e2ee5298966842194d0e950b433f0d4fc844ef464b9c32a7"
+ "sha256:0e7b81fc6ac8d1b7d3fffa79f9044ca4163784422582c9a3593305da2a69ec02",
+ "sha256:d7d1f81d230756f094b4d9ee59b37b2c13b2e63ff5fb72cda53625edb072cdae"
],
"index": "pypi",
- "version": "==2022.5.18"
+ "version": "==2022.7.18"
}
},
"develop": {
"autopep8": {
"hashes": [
- "sha256:6f09e90a2be784317e84dc1add17ebfc7abe3924239957a37e5040e27d812087",
- "sha256:ca9b1a83e53a7fad65d731dc7a2a2d50aa48f43850407c59f6a1a306c4201142"
+ "sha256:8b1659c7f003e693199f52caffdc06585bb0716900bbc6a7442fd931d658c077",
+ "sha256:ad924b42c2e27a1ac58e432166cc4588f5b80747de02d0d35b1ecbd3e7d57207"
],
"index": "pypi",
- "version": "==1.7.0"
+ "version": "==2.0.0"
},
"pycodestyle": {
"hashes": [
@@ -1250,13 +1272,13 @@
"markers": "python_version >= '3.6'",
"version": "==2.9.1"
},
- "toml": {
+ "tomli": {
"hashes": [
- "sha256:806143ae5bfb6a3c6e736a764057db0e6a0e05e338b5630894a5f779cabb4f9b",
- "sha256:b3bda1d108d5dd99f4a20d24d9c348e91c4db7ab1b749200bded2f839ccbe68f"
+ "sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc",
+ "sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f"
],
- "markers": "python_version >= '2.6' and python_version not in '3.0, 3.1, 3.2, 3.3'",
- "version": "==0.10.2"
+ "markers": "python_version >= '3.7'",
+ "version": "==2.0.1"
}
}
}
diff --git a/archivers/base_archiver.py b/archivers/base_archiver.py
index 3dc5ba1..7037e4d 100644
--- a/archivers/base_archiver.py
+++ b/archivers/base_archiver.py
@@ -244,8 +244,13 @@ class Archiver(ABC):
filename = os.path.join(browsertrix_home, "collections", collection, f"{collection}.wacz")
- self.storage.upload(filename, key, extra_args={
+ # do not crash if upload fails
+ try:
+ self.storage.upload(filename, key, extra_args={
'ACL': 'public-read', 'ContentType': 'application/zip'})
+ except FileNotFoundError as e:
+ logger.warning(f"Unable to locate and upload WACZ {filename=}, {key=}")
+
# clean up the local browsertrix files
try:
From 81eadd46720e384c0f598fba0f580eaee346a4f1 Mon Sep 17 00:00:00 2001
From: msramalho <19508417+msramalho@users.noreply.github.com>
Date: Tue, 8 Nov 2022 14:22:13 +0000
Subject: [PATCH 017/190] disable browsertrix on docker, see #66
---
archivers/base_archiver.py | 6 ++++++
1 file changed, 6 insertions(+)
diff --git a/archivers/base_archiver.py b/archivers/base_archiver.py
index 7037e4d..5ef2b7e 100644
--- a/archivers/base_archiver.py
+++ b/archivers/base_archiver.py
@@ -37,6 +37,7 @@ class Archiver(ABC):
self.driver = config.webdriver
self.hash_algorithm = config.hash_algorithm
self.browsertrix = config.browsertrix_config
+ self.is_docker = config.is_docker
def __str__(self):
return self.__class__.__name__
@@ -206,6 +207,11 @@ class Archiver(ABC):
if not self.browsertrix.enabled:
logger.debug(f"Browsertrix WACZ generation is not enabled, skipping.")
return
+ if self.is_docker:
+ # TODO: figure out support for browsertrix in docker
+ # see: https://github.com/bellingcat/auto-archiver/issues/66
+ logger.warning(f"Browsertrix WACZ is not yet supported when using DOCKER.")
+ return
logger.debug(f"getting wacz for {url}")
key = self._get_key_from_url(url, ".wacz", append_datetime=True)
From 390b84eb22393a40b44aaa69ae3b1f8432b752d8 Mon Sep 17 00:00:00 2001
From: msramalho <19508417+msramalho@users.noreply.github.com>
Date: Tue, 8 Nov 2022 15:55:33 +0000
Subject: [PATCH 018/190] dockerization complete
---
Dockerfile | 7 +++----
src/__init__.py | 0
{archivers => src/archivers}/__init__.py | 0
{archivers => src/archivers}/base_archiver.py | 0
{archivers => src/archivers}/instagram_archiver.py | 0
{archivers => src/archivers}/telegram_archiver.py | 0
{archivers => src/archivers}/telethon_archiver.py | 0
{archivers => src/archivers}/tiktok_archiver.py | 0
{archivers => src/archivers}/twitter_api_archiver.py | 0
{archivers => src/archivers}/twitter_archiver.py | 0
{archivers => src/archivers}/vk_archiver.py | 0
{archivers => src/archivers}/wayback_archiver.py | 0
{archivers => src/archivers}/youtubedl_archiver.py | 0
auto_archive.py => src/auto_archive.py | 0
auto_auto_archive.py => src/auto_auto_archive.py | 0
src/cli.py | 0
{configs => src/configs}/__init__.py | 0
{configs => src/configs}/browsertrix_config.py | 0
{configs => src/configs}/config.py | 0
{configs => src/configs}/instagram_config.py | 0
{configs => src/configs}/selenium_config.py | 0
{configs => src/configs}/telethon_config.py | 0
{configs => src/configs}/twitter_api_config.py | 0
{configs => src/configs}/vk_config.py | 0
{configs => src/configs}/wayback_config.py | 0
{storages => src/storages}/__init__.py | 0
{storages => src/storages}/base_storage.py | 0
{storages => src/storages}/gd_storage.py | 0
{storages => src/storages}/local_storage.py | 0
{storages => src/storages}/s3_storage.py | 0
{utils => src/utils}/__init__.py | 0
{utils => src/utils}/gworksheet.py | 0
{utils => src/utils}/misc.py | 0
33 files changed, 3 insertions(+), 4 deletions(-)
create mode 100644 src/__init__.py
rename {archivers => src/archivers}/__init__.py (100%)
rename {archivers => src/archivers}/base_archiver.py (100%)
rename {archivers => src/archivers}/instagram_archiver.py (100%)
rename {archivers => src/archivers}/telegram_archiver.py (100%)
rename {archivers => src/archivers}/telethon_archiver.py (100%)
rename {archivers => src/archivers}/tiktok_archiver.py (100%)
rename {archivers => src/archivers}/twitter_api_archiver.py (100%)
rename {archivers => src/archivers}/twitter_archiver.py (100%)
rename {archivers => src/archivers}/vk_archiver.py (100%)
rename {archivers => src/archivers}/wayback_archiver.py (100%)
rename {archivers => src/archivers}/youtubedl_archiver.py (100%)
rename auto_archive.py => src/auto_archive.py (100%)
rename auto_auto_archive.py => src/auto_auto_archive.py (100%)
create mode 100644 src/cli.py
rename {configs => src/configs}/__init__.py (100%)
rename {configs => src/configs}/browsertrix_config.py (100%)
rename {configs => src/configs}/config.py (100%)
rename {configs => src/configs}/instagram_config.py (100%)
rename {configs => src/configs}/selenium_config.py (100%)
rename {configs => src/configs}/telethon_config.py (100%)
rename {configs => src/configs}/twitter_api_config.py (100%)
rename {configs => src/configs}/vk_config.py (100%)
rename {configs => src/configs}/wayback_config.py (100%)
rename {storages => src/storages}/__init__.py (100%)
rename {storages => src/storages}/base_storage.py (100%)
rename {storages => src/storages}/gd_storage.py (100%)
rename {storages => src/storages}/local_storage.py (100%)
rename {storages => src/storages}/s3_storage.py (100%)
rename {utils => src/utils}/__init__.py (100%)
rename {utils => src/utils}/gworksheet.py (100%)
rename {utils => src/utils}/misc.py (100%)
diff --git a/Dockerfile b/Dockerfile
index a9b4d7a..5db284a 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -15,16 +15,15 @@ RUN pip install --upgrade pip && \
# install docker for WACZ
-RUN curl -fsSL https://get.docker.com | sh
+# TODO: currently disabled see https://github.com/bellingcat/auto-archiver/issues/66
+# RUN curl -fsSL https://get.docker.com | sh
# RUN git clone https://github.com/bellingcat/auto-archiver
# TODO: avoid copying unnecessary files, including .git
-# COPY ./src/ .
COPY Pipfile Pipfile.lock ./
RUN pipenv install --python=3.10 --system --deploy
-# TODO: to avoid copying pipfile lock it should be on the .dockerignore
ENV IS_DOCKER=1
-COPY . .
+COPY ./src/ .
# CMD ["pipenv", "run", "python", "auto_archive.py"]
ENTRYPOINT ["python", "auto_archive.py"]
diff --git a/src/__init__.py b/src/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/archivers/__init__.py b/src/archivers/__init__.py
similarity index 100%
rename from archivers/__init__.py
rename to src/archivers/__init__.py
diff --git a/archivers/base_archiver.py b/src/archivers/base_archiver.py
similarity index 100%
rename from archivers/base_archiver.py
rename to src/archivers/base_archiver.py
diff --git a/archivers/instagram_archiver.py b/src/archivers/instagram_archiver.py
similarity index 100%
rename from archivers/instagram_archiver.py
rename to src/archivers/instagram_archiver.py
diff --git a/archivers/telegram_archiver.py b/src/archivers/telegram_archiver.py
similarity index 100%
rename from archivers/telegram_archiver.py
rename to src/archivers/telegram_archiver.py
diff --git a/archivers/telethon_archiver.py b/src/archivers/telethon_archiver.py
similarity index 100%
rename from archivers/telethon_archiver.py
rename to src/archivers/telethon_archiver.py
diff --git a/archivers/tiktok_archiver.py b/src/archivers/tiktok_archiver.py
similarity index 100%
rename from archivers/tiktok_archiver.py
rename to src/archivers/tiktok_archiver.py
diff --git a/archivers/twitter_api_archiver.py b/src/archivers/twitter_api_archiver.py
similarity index 100%
rename from archivers/twitter_api_archiver.py
rename to src/archivers/twitter_api_archiver.py
diff --git a/archivers/twitter_archiver.py b/src/archivers/twitter_archiver.py
similarity index 100%
rename from archivers/twitter_archiver.py
rename to src/archivers/twitter_archiver.py
diff --git a/archivers/vk_archiver.py b/src/archivers/vk_archiver.py
similarity index 100%
rename from archivers/vk_archiver.py
rename to src/archivers/vk_archiver.py
diff --git a/archivers/wayback_archiver.py b/src/archivers/wayback_archiver.py
similarity index 100%
rename from archivers/wayback_archiver.py
rename to src/archivers/wayback_archiver.py
diff --git a/archivers/youtubedl_archiver.py b/src/archivers/youtubedl_archiver.py
similarity index 100%
rename from archivers/youtubedl_archiver.py
rename to src/archivers/youtubedl_archiver.py
diff --git a/auto_archive.py b/src/auto_archive.py
similarity index 100%
rename from auto_archive.py
rename to src/auto_archive.py
diff --git a/auto_auto_archive.py b/src/auto_auto_archive.py
similarity index 100%
rename from auto_auto_archive.py
rename to src/auto_auto_archive.py
diff --git a/src/cli.py b/src/cli.py
new file mode 100644
index 0000000..e69de29
diff --git a/configs/__init__.py b/src/configs/__init__.py
similarity index 100%
rename from configs/__init__.py
rename to src/configs/__init__.py
diff --git a/configs/browsertrix_config.py b/src/configs/browsertrix_config.py
similarity index 100%
rename from configs/browsertrix_config.py
rename to src/configs/browsertrix_config.py
diff --git a/configs/config.py b/src/configs/config.py
similarity index 100%
rename from configs/config.py
rename to src/configs/config.py
diff --git a/configs/instagram_config.py b/src/configs/instagram_config.py
similarity index 100%
rename from configs/instagram_config.py
rename to src/configs/instagram_config.py
diff --git a/configs/selenium_config.py b/src/configs/selenium_config.py
similarity index 100%
rename from configs/selenium_config.py
rename to src/configs/selenium_config.py
diff --git a/configs/telethon_config.py b/src/configs/telethon_config.py
similarity index 100%
rename from configs/telethon_config.py
rename to src/configs/telethon_config.py
diff --git a/configs/twitter_api_config.py b/src/configs/twitter_api_config.py
similarity index 100%
rename from configs/twitter_api_config.py
rename to src/configs/twitter_api_config.py
diff --git a/configs/vk_config.py b/src/configs/vk_config.py
similarity index 100%
rename from configs/vk_config.py
rename to src/configs/vk_config.py
diff --git a/configs/wayback_config.py b/src/configs/wayback_config.py
similarity index 100%
rename from configs/wayback_config.py
rename to src/configs/wayback_config.py
diff --git a/storages/__init__.py b/src/storages/__init__.py
similarity index 100%
rename from storages/__init__.py
rename to src/storages/__init__.py
diff --git a/storages/base_storage.py b/src/storages/base_storage.py
similarity index 100%
rename from storages/base_storage.py
rename to src/storages/base_storage.py
diff --git a/storages/gd_storage.py b/src/storages/gd_storage.py
similarity index 100%
rename from storages/gd_storage.py
rename to src/storages/gd_storage.py
diff --git a/storages/local_storage.py b/src/storages/local_storage.py
similarity index 100%
rename from storages/local_storage.py
rename to src/storages/local_storage.py
diff --git a/storages/s3_storage.py b/src/storages/s3_storage.py
similarity index 100%
rename from storages/s3_storage.py
rename to src/storages/s3_storage.py
diff --git a/utils/__init__.py b/src/utils/__init__.py
similarity index 100%
rename from utils/__init__.py
rename to src/utils/__init__.py
diff --git a/utils/gworksheet.py b/src/utils/gworksheet.py
similarity index 100%
rename from utils/gworksheet.py
rename to src/utils/gworksheet.py
diff --git a/utils/misc.py b/src/utils/misc.py
similarity index 100%
rename from utils/misc.py
rename to src/utils/misc.py
From 04263094ad01122620895df1d00d02e57bcdc5f4 Mon Sep 17 00:00:00 2001
From: msramalho <19508417+msramalho@users.noreply.github.com>
Date: Thu, 10 Nov 2022 17:46:40 +0000
Subject: [PATCH 019/190] WIP docker changes for cli and auto_archiver
---
Dockerfile | 9 +--
src/archivers/base_archiver.py | 40 +++++++++---
src/archivers/instagram_archiver.py | 4 +-
src/archivers/telegram_archiver.py | 4 +-
src/archivers/telethon_archiver.py | 6 +-
src/archivers/tiktok_archiver.py | 10 +--
src/archivers/twitter_api_archiver.py | 4 +-
src/archivers/twitter_archiver.py | 6 +-
src/archivers/vk_archiver.py | 4 +-
src/archivers/wayback_archiver.py | 8 +--
src/archivers/youtubedl_archiver.py | 4 +-
src/auto_archive.py | 87 ++++++++++++++-------------
src/cli.py | 30 +++++++++
src/configs/config.py | 3 +
src/storages/base_storage.py | 9 +++
src/storages/gd_storage.py | 7 ---
src/storages/local_storage.py | 7 ++-
17 files changed, 156 insertions(+), 86 deletions(-)
diff --git a/Dockerfile b/Dockerfile
index 5db284a..96b8405 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -18,16 +18,17 @@ RUN pip install --upgrade pip && \
# TODO: currently disabled see https://github.com/bellingcat/auto-archiver/issues/66
# RUN curl -fsSL https://get.docker.com | sh
-# RUN git clone https://github.com/bellingcat/auto-archiver
# TODO: avoid copying unnecessary files, including .git
COPY Pipfile Pipfile.lock ./
RUN pipenv install --python=3.10 --system --deploy
ENV IS_DOCKER=1
COPY ./src/ .
-# CMD ["pipenv", "run", "python", "auto_archive.py"]
-ENTRYPOINT ["python", "auto_archive.py"]
+# TODO: figure out how to make volumes not be root, does it depend on host or dockerfile?
+# RUN useradd --system --groups sudo --shell /bin/bash archiver && chown -R archiver:sudo .
+# USER archiver
+ENTRYPOINT ["python"]
# ENTRYPOINT ["docker-entrypoint.sh"]
-# should be executed with 2 volumes
+# should be executed with 2 volumes (3 if local_storage)
# docker run -v /var/run/docker.sock:/var/run/docker.sock -v $PWD/secrets:/app/secrets -v $PWD/local_archive:/app/local_archive aa --help
\ No newline at end of file
diff --git a/src/archivers/base_archiver.py b/src/archivers/base_archiver.py
index 5ef2b7e..75395b5 100644
--- a/src/archivers/base_archiver.py
+++ b/src/archivers/base_archiver.py
@@ -1,8 +1,9 @@
import os, datetime, shutil, hashlib, time, requests, re, mimetypes, subprocess
-from dataclasses import dataclass
+from dataclasses import dataclass, field
from abc import ABC, abstractmethod
from urllib.parse import urlparse
from random import randrange
+from collections import defaultdict
import ffmpeg
from loguru import logger
@@ -27,6 +28,7 @@ class ArchiveResult:
screenshot: str = None
wacz: str = None
hash: str = None
+ media: list = field(default_factory=list)
class Archiver(ABC):
name = "default"
@@ -38,6 +40,7 @@ class Archiver(ABC):
self.hash_algorithm = config.hash_algorithm
self.browsertrix = config.browsertrix_config
self.is_docker = config.is_docker
+ self.media = []
def __str__(self):
return self.__class__.__name__
@@ -48,13 +51,28 @@ class Archiver(ABC):
@abstractmethod
def download(self, url, check_if_exists=False): pass
+ def generateArchiveResult(self, **kwargs):
+ # remove duplicates
+ if "cdn_url" in kwargs:
+ self.add_to_media(kwargs["cdn_url"], None, kwargs.get("hash"))
+ kwargs["media"] = [dict(t) for t in {tuple(d.items()) for d in self.media}]
+ return ArchiveResult(**kwargs)
+
def get_netloc(self, url):
return urlparse(url).netloc
+ def add_to_media(self, cdn_url: str, key: str = None, hash: str = None):
+ media_info = {"url": cdn_url, "mime": self._guess_file_type(cdn_url) or "misc"}
+ if key: media_info["key"] = key
+ if hash: media_info["hash"] = hash
+ self.media.append(media_info)
+
def generate_media_page_html(self, url, urls_info: dict, object, thumbnail=None):
"""
Generates an index.html page where each @urls_info is displayed
"""
+ for ui in urls_info:
+ self.add_to_media(ui["cdn_url"], ui["key"], ui["hash"])
page = f'''
{url}
Archived media from {self.name}
@@ -109,6 +127,8 @@ class Archiver(ABC):
For a list of media urls, fetch them, upload them
and call self.generate_media_page_html with them
"""
+ for media_url in urls:
+ self.add_to_media(media_url)
thumbnail = None
uploaded_media = []
@@ -201,17 +221,20 @@ class Archiver(ABC):
self.driver.save_screenshot(filename)
self.storage.upload(filename, key, extra_args={'ACL': 'public-read', 'ContentType': 'image/png'})
- return self.storage.get_cdn_url(key)
+ cdn_url = self.storage.get_cdn_url(key)
+ self.add_to_media(cdn_url, key)
+
+ return cdn_url
def get_wacz(self, url):
if not self.browsertrix.enabled:
logger.debug(f"Browsertrix WACZ generation is not enabled, skipping.")
- return
+ return
if self.is_docker:
# TODO: figure out support for browsertrix in docker
# see: https://github.com/bellingcat/auto-archiver/issues/66
logger.warning(f"Browsertrix WACZ is not yet supported when using DOCKER.")
- return
+ return
logger.debug(f"getting wacz for {url}")
key = self._get_key_from_url(url, ".wacz", append_datetime=True)
@@ -220,7 +243,7 @@ class Archiver(ABC):
browsertrix_home = os.path.join(os.getcwd(), "browsertrix-tmp")
cmd = [
"docker", "run",
- "--rm", # delete container once it has completed running
+ "--rm", # delete container once it has completed running
"-v", f"{browsertrix_home}:/crawls/",
# "-it", # this leads to "the input device is not a TTY"
"webrecorder/browsertrix-crawler", "crawl",
@@ -253,18 +276,19 @@ class Archiver(ABC):
# do not crash if upload fails
try:
self.storage.upload(filename, key, extra_args={
- 'ACL': 'public-read', 'ContentType': 'application/zip'})
+ 'ACL': 'public-read', 'ContentType': 'application/zip'})
except FileNotFoundError as e:
logger.warning(f"Unable to locate and upload WACZ {filename=}, {key=}")
-
# clean up the local browsertrix files
try:
shutil.rmtree(browsertrix_home)
except PermissionError:
logger.warn(f"Unable to clean up browsertrix-crawler files in {browsertrix_home}")
- return self.storage.get_cdn_url(key)
+ cdn_url = self.storage.get_cdn_url(key)
+ self.add_to_media(cdn_url, key)
+ return cdn_url
def get_thumbnails(self, filename, key, duration=None):
thumbnails_folder = os.path.splitext(filename)[0] + os.path.sep
diff --git a/src/archivers/instagram_archiver.py b/src/archivers/instagram_archiver.py
index a2b1147..62db876 100644
--- a/src/archivers/instagram_archiver.py
+++ b/src/archivers/instagram_archiver.py
@@ -52,7 +52,7 @@ class InstagramArchiver(Archiver):
cdn_url = self.storage.get_cdn_url(key)
screenshot = self.get_screenshot(url)
wacz = self.get_wacz(url)
- return ArchiveResult(status='already archived', cdn_url=cdn_url, screenshot=screenshot, wacz=wacz)
+ return self.generateArchiveResult(status='already archived', cdn_url=cdn_url, screenshot=screenshot, wacz=wacz)
try:
# process if post
@@ -137,4 +137,4 @@ class InstagramArchiver(Archiver):
screenshot = self.get_screenshot(url)
wacz = self.get_wacz(url)
- return ArchiveResult(status=status, cdn_url=page_cdn, title=title, timestamp=date, hash=page_hash, screenshot=screenshot, wacz=wacz)
+ return self.generateArchiveResult(status=status, cdn_url=page_cdn, title=title, timestamp=date, hash=page_hash, screenshot=screenshot, wacz=wacz)
diff --git a/src/archivers/telegram_archiver.py b/src/archivers/telegram_archiver.py
index 026bdd0..c6d8747 100644
--- a/src/archivers/telegram_archiver.py
+++ b/src/archivers/telegram_archiver.py
@@ -47,7 +47,7 @@ class TelegramArchiver(Archiver):
time_elements = s.find_all('time')
timestamp = time_elements[0].get('datetime') if len(time_elements) else None
- return ArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=timestamp, wacz=wacz)
+ return self.generateArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=timestamp, wacz=wacz)
video_url = video.get('src')
video_id = video_url.split('/')[-1].split('?')[0]
@@ -85,5 +85,5 @@ class TelegramArchiver(Archiver):
os.remove(filename)
cdn_url = self.storage.get_cdn_url(key)
- return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb, thumbnail_index=thumb_index,
+ return self.generateArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb, thumbnail_index=thumb_index,
duration=duration, title=original_url, timestamp=s.find_all('time')[1].get('datetime'), hash=hash, screenshot=screenshot, wacz=wacz)
diff --git a/src/archivers/telethon_archiver.py b/src/archivers/telethon_archiver.py
index 5c147de..f0ff194 100644
--- a/src/archivers/telethon_archiver.py
+++ b/src/archivers/telethon_archiver.py
@@ -80,7 +80,7 @@ class TelethonArchiver(Archiver):
if check_if_exists and self.storage.exists(key):
# only s3 storage supports storage.exists as not implemented on gd
cdn_url = self.storage.get_cdn_url(key)
- return ArchiveResult(status='already archived', cdn_url=cdn_url, title=post.message, timestamp=post.date, screenshot=screenshot, wacz=wacz)
+ return self.generateArchiveResult(status='already archived', cdn_url=cdn_url, title=post.message, timestamp=post.date, screenshot=screenshot, wacz=wacz)
key_thumb, thumb_index = None, None
group_id = post.grouped_id if post.grouped_id is not None else post.id
@@ -119,7 +119,7 @@ class TelethonArchiver(Archiver):
page_cdn, page_hash, _ = self.generate_media_page_html(url, uploaded_media, html.escape(str(post)))
- return ArchiveResult(status=status, cdn_url=page_cdn, title=message, timestamp=post.date, hash=page_hash, screenshot=screenshot, thumbnail=key_thumb, thumbnail_index=thumb_index, wacz=wacz)
+ return self.generateArchiveResult(status=status, cdn_url=page_cdn, title=message, timestamp=post.date, hash=page_hash, screenshot=screenshot, thumbnail=key_thumb, thumbnail_index=thumb_index, wacz=wacz)
page_cdn, page_hash, _ = self.generate_media_page_html(url, [], html.escape(str(post)))
- return ArchiveResult(status=status, cdn_url=page_cdn, title=post.message, timestamp=getattr_or(post, "date"), hash=page_hash, screenshot=screenshot, wacz=wacz)
+ return self.generateArchiveResult(status=status, cdn_url=page_cdn, title=post.message, timestamp=getattr_or(post, "date"), hash=page_hash, screenshot=screenshot, wacz=wacz)
diff --git a/src/archivers/tiktok_archiver.py b/src/archivers/tiktok_archiver.py
index bdaad52..55cb97e 100644
--- a/src/archivers/tiktok_archiver.py
+++ b/src/archivers/tiktok_archiver.py
@@ -28,9 +28,9 @@ class TiktokArchiver(Archiver):
if len(media) <= 0:
if status == 'already archived':
- return ArchiveResult(status='Could not download media, but already archived', cdn_url=self.storage.get_cdn_url(key))
+ return self.generateArchiveResult(status='Could not download media, but already archived', cdn_url=self.storage.get_cdn_url(key))
else:
- return ArchiveResult(status='Could not download media')
+ return self.generateArchiveResult(status='Could not download media')
logger.info(f'downloading video {key=}')
media[0].download(filename)
@@ -56,17 +56,17 @@ class TiktokArchiver(Archiver):
cdn_url = self.storage.get_cdn_url(key)
timestamp = info.create.isoformat() if hasattr(info, "create") else None
- return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb,
+ return self.generateArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb,
thumbnail_index=thumb_index, duration=getattr(info, "duration", 0), title=getattr(info, "caption", ""),
timestamp=timestamp, hash=hash, screenshot=screenshot, wacz=wacz)
except tiktok_downloader.Except.InvalidUrl as e:
status = 'Invalid URL'
logger.warning(f'Invalid URL on {url} {e}\n{traceback.format_exc()}')
- return ArchiveResult(status=status)
+ return self.generateArchiveResult(status=status)
except:
error = traceback.format_exc()
status = 'Other Tiktok error: ' + str(error)
logger.warning(f'Other Tiktok error' + str(error))
- return ArchiveResult(status=status)
+ return self.generateArchiveResult(status=status)
diff --git a/src/archivers/twitter_api_archiver.py b/src/archivers/twitter_api_archiver.py
index 454cfe2..da56d31 100644
--- a/src/archivers/twitter_api_archiver.py
+++ b/src/archivers/twitter_api_archiver.py
@@ -40,7 +40,7 @@ class TwitterApiArchiver(TwitterArchiver):
# only s3 storage supports storage.exists as not implemented on gd
cdn_url = self.storage.get_cdn_url(key)
screenshot = self.get_screenshot(url)
- return ArchiveResult(status='already archived', cdn_url=cdn_url, title=tweet.data.text, timestamp=timestamp, screenshot=screenshot)
+ return self.generateArchiveResult(status='already archived', cdn_url=cdn_url, title=tweet.data.text, timestamp=timestamp, screenshot=screenshot)
urls = []
if tweet.includes:
@@ -72,4 +72,4 @@ class TwitterApiArchiver(TwitterArchiver):
screenshot = self.get_screenshot(url)
wacz = self.get_wacz(url)
page_cdn, page_hash, thumbnail = self.generate_media_page(urls, url, output)
- return ArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=timestamp, title=tweet.data.text, wacz=wacz)
+ return self.generateArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=timestamp, title=tweet.data.text, wacz=wacz)
diff --git a/src/archivers/twitter_archiver.py b/src/archivers/twitter_archiver.py
index b868af5..f1f22c0 100644
--- a/src/archivers/twitter_archiver.py
+++ b/src/archivers/twitter_archiver.py
@@ -41,7 +41,7 @@ class TwitterArchiver(Archiver):
screenshot = self.get_screenshot(url)
wacz = self.get_wacz(url)
page_cdn, page_hash, _ = self.generate_media_page_html(url, [], html.escape(tweet.json()))
- return ArchiveResult(status="success", cdn_url=page_cdn, title=tweet.content, timestamp=tweet.date, hash=page_hash, screenshot=screenshot, wacz=wacz)
+ return self.generateArchiveResult(status="success", cdn_url=page_cdn, title=tweet.content, timestamp=tweet.date, hash=page_hash, screenshot=screenshot, wacz=wacz)
urls = []
@@ -62,7 +62,7 @@ class TwitterArchiver(Archiver):
screenshot = self.get_screenshot(url)
wacz = self.get_wacz(url)
- return ArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=tweet.date, title=tweet.content, wacz=wacz)
+ return self.generateArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=tweet.date, title=tweet.content, wacz=wacz)
def download_alternative(self, url, tweet_id):
# https://stackoverflow.com/a/71867055/6196010
@@ -87,7 +87,7 @@ class TwitterArchiver(Archiver):
screenshot = self.get_screenshot(url)
wacz = self.get_wacz(url)
page_cdn, page_hash, thumbnail = self.generate_media_page(urls, url, r.text)
- return ArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=timestamp, title=tweet["text"], wacz=wacz)
+ return self.generateArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=timestamp, title=tweet["text"], wacz=wacz)
def choose_variant(self, variants):
# choosing the highest quality possible
diff --git a/src/archivers/vk_archiver.py b/src/archivers/vk_archiver.py
index 91b8354..1d38fa9 100644
--- a/src/archivers/vk_archiver.py
+++ b/src/archivers/vk_archiver.py
@@ -31,7 +31,7 @@ class VkArchiver(Archiver):
# if check_if_exists and self.storage.exists(key):
# screenshot = self.get_screenshot(url)
# cdn_url = self.storage.get_cdn_url(key)
- # return ArchiveResult(status="already archived", cdn_url=cdn_url, screenshot=screenshot)
+ # return self.generateArchiveResult(status="already archived", cdn_url=cdn_url, screenshot=screenshot)
results = self.vks.scrape(url) # some urls can contain multiple wall/photo/... parts and all will be fetched
if len(results) == 0:
@@ -71,4 +71,4 @@ class VkArchiver(Archiver):
# # if multiple wall/photos/videos are present the screenshot will only grab the 1st
screenshot = self.get_screenshot(url)
wacz = self.get_wacz(url)
- return ArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, thumbnail_index=thumbnail_index, timestamp=datetime, title=title, wacz=wacz)
+ return self.generateArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, thumbnail_index=thumbnail_index, timestamp=datetime, title=title, wacz=wacz)
diff --git a/src/archivers/wayback_archiver.py b/src/archivers/wayback_archiver.py
index e0ede90..1bfa78a 100644
--- a/src/archivers/wayback_archiver.py
+++ b/src/archivers/wayback_archiver.py
@@ -39,7 +39,7 @@ class WaybackArchiver(Archiver):
if r.status_code != 200:
logger.warning(f"Internet archive failed with status of {r.status_code}")
- return ArchiveResult(status="Internet archive failed", screenshot=screenshot, wacz=wacz)
+ return self.generateArchiveResult(status="Internet archive failed", screenshot=screenshot, wacz=wacz)
if 'job_id' not in r.json() and 'message' in r.json():
return self.custom_retry(r.json(), screenshot=screenshot, wacz=wacz)
@@ -61,7 +61,7 @@ class WaybackArchiver(Archiver):
retries += 1
if status_r.status_code != 200:
- return ArchiveResult(status=f"Internet archive failed: check https://web.archive.org/save/status/{job_id}", screenshot=screenshot, wacz=wacz)
+ return self.generateArchiveResult(status=f"Internet archive failed: check https://web.archive.org/save/status/{job_id}", screenshot=screenshot, wacz=wacz)
status_json = status_r.json()
if status_json['status'] != 'success':
@@ -77,7 +77,7 @@ class WaybackArchiver(Archiver):
title = 'Could not get title'
except:
title = "Could not get title"
- self.seen_urls[url] = ArchiveResult(status='success', cdn_url=archive_url, title=title, screenshot=screenshot, wacz=wacz)
+ self.seen_urls[url] = self.generateArchiveResult(status='success', cdn_url=archive_url, title=title, screenshot=screenshot, wacz=wacz)
return self.seen_urls[url]
def custom_retry(self, json_data, **kwargs):
@@ -86,4 +86,4 @@ class WaybackArchiver(Archiver):
return self.signal_retry_in(**kwargs)
if "this host has been already captured" in str(json_data).lower():
return self.signal_retry_in(**kwargs, min_seconds=86400, max_seconds=129600) # 24h to 36h later
- return ArchiveResult(status=f"Internet archive failed: {json_data}", **kwargs)
+ return self.generateArchiveResult(status=f"Internet archive failed: {json_data}", **kwargs)
diff --git a/src/archivers/youtubedl_archiver.py b/src/archivers/youtubedl_archiver.py
index 5d09442..e2f27a2 100644
--- a/src/archivers/youtubedl_archiver.py
+++ b/src/archivers/youtubedl_archiver.py
@@ -38,7 +38,7 @@ class YoutubeDLArchiver(Archiver):
if info.get('is_live', False):
logger.warning("Live streaming media, not archiving now")
- return ArchiveResult(status="Streaming media")
+ return self.generateArchiveResult(status="Streaming media")
if 'twitter.com' in netloc:
if 'https://twitter.com/' in info['webpage_url']:
@@ -114,5 +114,5 @@ class YoutubeDLArchiver(Archiver):
elif 'upload_date' in info and info['upload_date'] is not None:
timestamp = datetime.datetime.strptime(info['upload_date'], '%Y%m%d').replace(tzinfo=datetime.timezone.utc)
- return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb, thumbnail_index=thumb_index, duration=duration,
+ return self.generateArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb, thumbnail_index=thumb_index, duration=duration,
title=info['title'] if 'title' in info else None, timestamp=timestamp, hash=hash, screenshot=screenshot, wacz=wacz)
diff --git a/src/auto_archive.py b/src/auto_archive.py
index 3412b0a..a797405 100644
--- a/src/auto_archive.py
+++ b/src/auto_archive.py
@@ -57,7 +57,7 @@ def missing_required_columns(gw: GWorksheet):
return missing
-def should_process_sheet(c, sheet_name):
+def should_process_sheet(c: Config, sheet_name):
if len(c.worksheet_allow) and sheet_name not in c.worksheet_allow:
# ALLOW rules exist AND sheet name not explicitly allowed
return False
@@ -67,6 +67,50 @@ def should_process_sheet(c, sheet_name):
return True
+def archive_url(c: Config, url: str, folder: str, debug_string: str, is_retry: bool):
+ url = expand_url(url)
+ c.set_folder(folder)
+ storage = c.get_storage()
+
+ # make a new driver so each spreadsheet row is idempotent
+ c.recreate_webdriver()
+
+ # order matters, first to succeed excludes remaining
+ active_archivers = [
+ TelethonArchiver(storage, c),
+ TiktokArchiver(storage, c),
+ TwitterApiArchiver(storage, c),
+ InstagramArchiver(storage, c),
+ YoutubeDLArchiver(storage, c),
+ TelegramArchiver(storage, c),
+ TwitterArchiver(storage, c),
+ VkArchiver(storage, c),
+ WaybackArchiver(storage, c)
+ ]
+
+ for archiver in active_archivers:
+ logger.debug(f'Trying {archiver} on {debug_string}')
+
+ try:
+ result = archiver.download(url, check_if_exists=c.check_if_exists)
+ except KeyboardInterrupt as e: raise e # so the higher level catch can catch it
+ except Exception as e:
+ result = False
+ logger.error(f'Got unexpected error in {debug_string} with {archiver.name} for {url=}: {e}\n{traceback.format_exc()}')
+
+ if result:
+ success = result.status in ['success', 'already archived']
+ result.status = f"{archiver.name}: {result.status}"
+ if success:
+ logger.success(f'{archiver.name} succeeded on {debug_string}, {url=}')
+ break
+ # only 1 retry possible for now
+ if is_retry and Archiver.is_retry(result.status):
+ result.status = Archiver.remove_retry(result.status)
+ logger.warning(f'{archiver.name} did not succeed on {debug_string}, final status: {result.status}')
+ return result
+
+
def process_sheet(c: Config):
sh = c.gsheets_client.open(c.sheet)
@@ -100,46 +144,7 @@ def process_sheet(c: Config):
# All checks done - archival process starts here
try:
gw.set_cell(row, 'status', 'Archive in progress')
- url = expand_url(url)
- c.set_folder(gw.get_cell_or_default(row, 'folder', default_folder, when_empty_use_default=True))
-
- # make a new driver so each spreadsheet row is idempotent
- c.recreate_webdriver()
-
- # order matters, first to succeed excludes remaining
- active_archivers = [
- TelethonArchiver(storage, c),
- TiktokArchiver(storage, c),
- TwitterApiArchiver(storage, c),
- InstagramArchiver(storage, c),
- YoutubeDLArchiver(storage, c),
- TelegramArchiver(storage, c),
- TwitterArchiver(storage, c),
- VkArchiver(storage, c),
- WaybackArchiver(storage, c)
- ]
-
- for archiver in active_archivers:
- logger.debug(f'Trying {archiver} on {row=}')
-
- try:
- result = archiver.download(url, check_if_exists=c.check_if_exists)
- except KeyboardInterrupt as e: raise e # so the higher level catch can catch it
- except Exception as e:
- result = False
- logger.error(f'Got unexpected error in row {row} with {archiver.name} for {url=}: {e}\n{traceback.format_exc()}')
-
- if result:
- success = result.status in ['success', 'already archived']
- result.status = f"{archiver.name}: {result.status}"
- if success:
- logger.success(f'{archiver.name} succeeded on {row=}, {url=}')
- break
- # only 1 retry possible for now
- if is_retry and Archiver.is_retry(result.status):
- result.status = Archiver.remove_retry(result.status)
- logger.warning(f'{archiver.name} did not succeed on {row=}, final status: {result.status}')
-
+ result = archive_url(c, url, gw.get_cell_or_default(row, 'folder', default_folder, when_empty_use_default=True), f"{row=}", is_retry=is_retry)
if result:
update_sheet(gw, row, url, result)
else:
diff --git a/src/cli.py b/src/cli.py
index e69de29..b6d2b70 100644
--- a/src/cli.py
+++ b/src/cli.py
@@ -0,0 +1,30 @@
+import tempfile, json
+import auto_archive
+from loguru import logger
+from configs import Config
+from storages import Storage
+from slugify import slugify
+
+
+def main():  # CLI entry point: archive the single URL given via --url and upload its media listing as JSON
+    c = Config()
+    c.parse()
+    url = c.url
+    if not url:
+        logger.error(f"Invalid URL: '{url}'")  # f-prefix was missing, so the literal text "{url}" was logged
+        return
+    logger.info(f'Archiving "{url=}".')
+    with tempfile.TemporaryDirectory(dir="./") as tmpdir:
+        Storage.TMP_FOLDER = tmpdir
+        result = auto_archive.archive_url(c, url, "", f"{url=}", False)  # folder="", is_retry=False
+        c.destroy_webdriver()
+    key = f"media_{slugify(url)}.json"  # the same name is used for the local file and the storage key
+    with open(key, "w", encoding="utf-8") as outf:
+        json.dump(result.media, outf, ensure_ascii=False, indent=4)
+    c.get_storage().upload(key, key)
+    print(result)
+    return result
+
+
+if __name__ == "__main__":
+ main()
diff --git a/src/configs/config.py b/src/configs/config.py
index 01b8173..bbd385e 100644
--- a/src/configs/config.py
+++ b/src/configs/config.py
@@ -47,6 +47,8 @@ class Config:
with open(self.config_file, "r", encoding="utf-8") as inf:
self.config = yaml.safe_load(inf)
+ self.url = getattr_or(self.args, "url", '')
+
# ----------------------EXECUTION - execution configurations
execution = self.config.get("execution", {})
@@ -211,6 +213,7 @@ class Config:
"""
parser = argparse.ArgumentParser(description='Automatically archive social media posts, videos, and images from a Google Sheets document. The command line arguments will always override the configurations in the provided YAML config file (--config), only some high-level options are allowed via the command line and the YAML configuration file is the preferred method. The sheet must have the "url" and "status" for the archiver to work. ')
+ parser.add_argument('--url', action='store', dest='url', help='single URL to archive - to use only via cli.py and not google sheets interaction')
parser.add_argument('--config', action='store', dest='config', help='the filename of the YAML configuration file (defaults to \'config.yaml\')', default='config.yaml')
parser.add_argument('--storage', action='store', dest='storage', help='which storage to use [execution.storage in config.yaml]', choices=Config.AVAILABLE_STORAGES)
parser.add_argument('--sheet', action='store', dest='sheet', help='the name of the google sheets document [execution.sheet in config.yaml]')
diff --git a/src/storages/base_storage.py b/src/storages/base_storage.py
index cde00fe..f147678 100644
--- a/src/storages/base_storage.py
+++ b/src/storages/base_storage.py
@@ -1,3 +1,4 @@
+import os, uuid
from loguru import logger
from abc import ABC, abstractmethod
from pathlib import Path
@@ -18,6 +19,14 @@ class Storage(ABC):
@abstractmethod
def uploadf(self, file, key, **kwargs): pass
+ def clean_key(self, key):
+        # Some storages do not work well with leading forward slashes and some keys come with one
+ if key.startswith('/'):
+ logger.debug(f'Found and fixed a leading "/" for {key=}')
+ return key[1:]
+ return key
+
+
def upload(self, filename: str, key: str, **kwargs):
logger.debug(f'[{self.__class__.__name__}] uploading file {filename} with key {key}')
with open(filename, 'rb') as f:
diff --git a/src/storages/gd_storage.py b/src/storages/gd_storage.py
index 5f3bbeb..3af77f1 100644
--- a/src/storages/gd_storage.py
+++ b/src/storages/gd_storage.py
@@ -116,13 +116,6 @@ class GDStorage(Storage):
# GD only requires the filename not a file reader
self.uploadf(filename, key, **kwargs)
- def clean_key(self, key):
- # GDrive does not work well with trailing forward slashes and some keys come with that
- if key.startswith('/'):
- logger.debug(f'Found and fixed a leading "/" for {key=}')
- return key[1:]
- return key
-
# gets the Drive folderID if it is there
def _get_id_from_parent_and_name(self, parent_id: str, name: str, retries: int = 1, sleep_seconds: int = 10, use_mime_type: bool = False, raise_on_missing: bool = True, use_cache=False):
"""
diff --git a/src/storages/local_storage.py b/src/storages/local_storage.py
index ca328e0..1109767 100644
--- a/src/storages/local_storage.py
+++ b/src/storages/local_storage.py
@@ -1,6 +1,7 @@
import os
from dataclasses import dataclass
+from loguru import logger
from .base_storage import Storage
from utils import mkdir_if_not_exists
@@ -18,8 +19,12 @@ class LocalStorage(Storage):
mkdir_if_not_exists(self.save_to)
def get_cdn_url(self, key):
+ key = self.clean_key(key)
+ logger.info(f"{key=}")
full_path = os.path.join(self.save_to, self.folder, key)
- mkdir_if_not_exists(os.path.join(*full_path.split(os.path.sep)[0:-1]))
+ logger.debug(f"{full_path=} creating dir structure to {os.path.dirname(full_path)}")
+ os.makedirs(os.path.dirname(full_path), exist_ok=True)
+ # mkdir_if_not_exists(os.path.join(*full_path.split(os.path.sep)[0:-1]))
return os.path.abspath(full_path)
def exists(self, key):
From 6a0ce5ced18d94bd9a7454cfe77f079ff80313f8 Mon Sep 17 00:00:00 2001
From: msramalho <19508417+msramalho@users.noreply.github.com>
Date: Fri, 11 Nov 2022 02:08:48 +0000
Subject: [PATCH 020/190] orchestrator design structure
---
orchestrate.yaml | 48 ++++++++++
src/orchestrator.py | 215 ++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 263 insertions(+)
create mode 100644 orchestrate.yaml
create mode 100644 src/orchestrator.py
diff --git a/orchestrate.yaml b/orchestrate.yaml
new file mode 100644
index 0000000..9a4ec42
--- /dev/null
+++ b/orchestrate.yaml
@@ -0,0 +1,48 @@
+steps:
+ # only 1 feeder allowed
+  # a feeder could be in an "infinite loop" for example: gsheets_infinite feeder which holds-> this could be an easy logic addition by modifying for each to while not feeder.done() if it becomes necessary
+ feeder: gsheets_feeder # default -> only expects URL from CLI
+ archivers: # order matters
+ - tiktok
+ - telethon
+ - twitter
+ - instagram
+ - webarchive # this way it runs as a failsafe only
+ enrichments:
+ - screenshot
+ - wacz
+ - webarchive # this way it runs for every case, webarchive extends archiver and enrichment
+ - thumbnails
+ formatters:
+ - HTMLFormater
+ - PDFFormater
+ storages:
+ - local_storage
+ - s3
+ databases:
+ - gsheets_db
+ - mongo_db
+
+
+
+configurations:
+ gsheets_feeder:
+ - sheet: "Auto archiver"
+ - header: "" # defaults to 1 in GSheetsFeeder
+ - service_account: "secrets/service_account.json"
+ tiktok:
+ username: "abc"
+ password: "123"
+ token: "here"
+ screenshot:
+ width: 1280
+ height: 720
+ wacz:
+ profile: secrets/profile.tar.gz
+ webarchive:
+ api_key: "12345"
+ s3:
+ - bucket: 123
+ - region: "nyc3"
+ - cdn: "{region}{bucket}"
+
diff --git a/src/orchestrator.py b/src/orchestrator.py
new file mode 100644
index 0000000..30f7b3c
--- /dev/null
+++ b/src/orchestrator.py
@@ -0,0 +1,215 @@
+from typing import Union, Dict
+from __future__ import annotations
+from dataclasses import dataclass
+
+"""
+how not to couple the different pieces of logic
+due to the use of constants for the metadata keys?
+perhaps having methods on the Metadata level that can be used to fetch a limited number of
+keys, never using strings but rather methods?
+eg: m = Metadata()
+ m.get("screenshot") vs m.get_all()
+ m.get_url()
+ m.get_hash()
+ m.get_main_file().get_title()
+ m.get_screenshot() # this method should only exist because of the Screenshot Enricher
+    # maybe there is a way for Archivers and Enrichers and Storages to add their own methods
+    # which still raises the question of how the database, e.g., knows they exist?
+    # maybe there's a function to fetch them all, and each Database can register whatever they get
+ # for eg the GoogleSheets will only register based on the available column names, it knows what it wants
+ # and if it's there: great, otherwise business as usual.
+ # and a MongoDatabase could register all data, for example.
+ #
+How are Orchestrators created? from a configuration file?
+ orchestrator = ArchivingOrchestrator(config)
+ # Config contains 1 URL, or URLs, from the command line
+ # OR a feeder which is described in the config file
+ # config.get_feeder() # if called as docker run --url "http...." then the uses the default filter
+ # if config.yaml says config
+ orchestrator.start()
+
+
+Example applications:
+1. auto-archiver for GSheets
+2. archiver for URL: feeder is CLIFeeder(config.cli.urls="") # --urls="u1,u2"
+3. archiver backend for a UI that implements a REST API, the API calls CLI
+
+Cisticola considerations:
+1. By isolating the archiving logic into "Archiving only pieces of logic" these could simply call cisticola.tiktok_scraper(user, pass)
+2. So the auto-archiver becomes like a puzzle and fixes to Cisticola scrapers can immediately benefit it, and contributions are focused on a single source or scraping
+"""
+
+@dataclass
+class Metadata:
+ # does not handle files, only primitives
+ # the only piece of logic to handle files is the archiver, enricher, and storage
+ status: str
+ # title: str
+ # url: str
+ # hash: str
+ main_file: Metadata
+ metadata: Dict[str, Metadata]
+
+ @staticmethod
+ def merge(left, right : Metadata, overwrite_left=True) -> Metadata:
+ # should return a merged version of the Metadata
+ # will work for archived() and enriched()
+ # what if 2 metadatas contain the same keys? only one can remain! : overwrite_left
+ pass
+
+ def get(self, key) -> Union[Metadata, str]:
+ # goes through metadata and returns the Metadata available
+ pass
+
+ def as_json(self) -> str:
+ # converts all metadata and data into JSON
+ pass
+
+
+"""
+@dataclass
+class ArchiveResult:
+ # maybe metadata can have status as well, eg: screenshot fails. should that be registered in the databases? likely yes
+ status: str
+ url: str
+ metadata: Metadata
+ # title, url, hash, other={}
+ # cdn_url: str = None
+ # thumbnail: str = None
+ # thumbnail_index: str = None
+ # duration: float = None
+ # title: str = None
+ # timestamp: datetime.datetime = None
+ # screenshot: str = None
+ # wacz: str = None
+ # hash: str = None
+ # media: list = field(default_factory=list)
+
+ def __init__(self) -> None: pass
+
+ def update(self, metadata) -> None:
+ # receive a Metadata instance and update itself with it!
+ pass
+
+ def as_json(self) -> str:
+ # converts all metadata and data into JSON
+ pass
+"""
+
+"""
+There is a Superclass for:
+ * Database (should_process)
+
+How can GSheets work? it needs to feed from a READER (GSheets Feeder)
+
+Once an archiver returns a link to a local file (for eg to a storage), how do we then delete the produced local files?
+The context metadata should include a temporary folder (maybe a LocalStorage instance?)
+"""
+
+class ArchivingOrchestrator:
+ def __init__(self, config) -> None:
+ # in config.py we should test that the archivers exist and log mismatches (blocking execution)
+ # identify each formatter, storage, database, etc
+ self.feeder = Feeder.init(config.feeder, config.get(config.feeder))
+
+ # Is it possible to overwrite config.yaml values? it could be useful: share config file and modify gsheets_feeder.sheet via CLI
+ # where does that update/processing happen? in config.py
+ # reflection for Archiver to know wihch child classes it has? use Archiver.__subclasses__
+ self.archivers = [
+ Archiver.init(a, config.get(a))
+ for a in config.archivers
+ ]
+
+ self.enrichments = [
+ Enrichment.init(e, config.get(e))
+ for e in config.enrichments
+ ]
+
+ self.formatters = [
+ Formatter.init(f, config.get(f))
+ for f in config.formatters
+ ]
+
+ self.storages = [
+ Storage.init(s, config.get(s))
+ for s in config.storages
+ ]
+
+ self.databases = [
+ Database.init(f, config.get(f))
+ for f in config.formatters
+ ]
+
+ # these rules are checked in config.py
+ assert len(archivers) > 1, "there needs to be at least one Archiver"
+
+ def feed(self, feeder: Feeder) -> list(ArchiveResult):
+ for next in feeder:
+ self.archive(next)
+ # how does this handle the parameters like folder which can be different for each archiver?
+ # the storage needs to know where to archive!!
+ # solution: feeders have context: extra metadata that they can read or ignore,
+ # all of it should have sensible defaults (eg: folder)
+ # default feeder is a list with 1 element
+
+ def archive(url) -> Union[ArchiveResult, None]:
+ url = clear_url(url)
+ result = Metadata(url=url)
+
+
+ should_archive = True
+ for d in databases: should_archive &= d.should_process(url)
+ # should storages also be able to check?
+ for s in storages: should_archive &= s.should_process(url)
+
+ if not should_archive:
+ return "skipping"
+
+ # signal to DB that archiving has started
+ for d in databases:
+ # are the databases to decide whether to archive?
+ # they can simply return True by default, otherwise they can avoid duplicates. should this logic be more granular, for example on the archiver level: a tweet will not need be scraped twice, whereas an instagram profile might. the archiver could not decide from the link which parts to archive,
+ # instagram profile example: it would always re-archive everything
+ # maybe the database/storage could use a hash/key to decide if there's a need to re-archive
+ if d.should_process(url):
+ d.started(url)
+ elif d.exists(url):
+ return d.fetch(url)
+ else:
+ print("Skipping url")
+ return
+
+ # vk, telethon, ...
+ for a in archivers:
+ # with automatic try/catch in download + archived (+ the other ops below)
+ # should the archivers come with the config already? are there configs which change at runtime?
+ # think not, so no need to pass config as parameter
+ # do they need to be refreshed with every execution?
+ # this is where the Hashes come from, the place with access to all content
+ # the archiver does not have access to storage
+ result.update(a.download(url))
+ if result.is_success(): break
+
+ # what if an archiver returns multiple entries and one is to be part of HTMLgenerator?
+ # should it call the HTMLgenerator as if it's not an enrichment?
+ # eg: if it is enable: generates an HTML with all the returned media, should it include enrichments? yes
+ # then how to execute it last? should there also be post-processors? are there other examples?
+ # maybe as a PDF? or a Markdown file
+ # side captures: screenshot, wacz, webarchive, thumbnails, HTMLgenerator
+ for e in enrichments:
+ result.update(e.enrich(result))
+
+ # formatters, enrichers, and storages will sometimes look for specific properties: eg
Screenshot:
+ for p in formatter:
+ result.update(p.process(result))
+
+ # storages
+ for s in storages:
+ for m in result.media:
+ m.update(s.store(m))
+
+ # signal completion to databases (DBs, Google Sheets, CSV, ...)
+ # a hash registration service could be one database: forensic archiving
+ for d in databases: d.done( result)
+
+ return result
\ No newline at end of file
From 65dd155c9047041f468a3a4e44866fae858d6cb2 Mon Sep 17 00:00:00 2001
From: msramalho <19508417+msramalho@users.noreply.github.com>
Date: Tue, 15 Nov 2022 15:00:52 +0000
Subject: [PATCH 021/190] WIP refactor logic
---
README.md | 40 ++++++++++++--
orchestrate.yaml | 8 +--
src/configs/v2config.py | 80 ++++++++++++++++++++++++++++
src/enrichers/__init__.py | 2 +
src/enrichers/enricher.py | 20 +++++++
src/enrichers/enricher_screenshot.py | 53 ++++++++++++++++++
src/metadata.py | 30 +++++++++++
src/orchestrator.py | 64 +++++++++++-----------
src/step.py | 30 +++++++++++
src/utils/__init__.py | 3 +-
src/utils/util.py | 20 +++++++
src/v2.py | 9 ++++
12 files changed, 320 insertions(+), 39 deletions(-)
create mode 100644 src/configs/v2config.py
create mode 100644 src/enrichers/__init__.py
create mode 100644 src/enrichers/enricher.py
create mode 100644 src/enrichers/enricher_screenshot.py
create mode 100644 src/metadata.py
create mode 100644 src/step.py
create mode 100644 src/utils/util.py
create mode 100644 src/v2.py
diff --git a/README.md b/README.md
index 8bdc7d5..11ff002 100644
--- a/README.md
+++ b/README.md
@@ -4,15 +4,40 @@ Read the [article about Auto Archiver on bellingcat.com](https://www.bellingcat.
Python script to automatically archive social media posts, videos, and images from a Google Sheets document. Uses different archivers depending on the platform, and can save content to local storage, S3 bucket (Digital Ocean Spaces, AWS, ...), and Google Drive. The Google Sheets where the links come from is updated with information about the archived content. It can be run manually or on an automated basis.
+
+# Requirement configurations
+# Running with docker
+# Running without docker
+
+
+
+### Setup checklist
+Use this checklist to make sure you completed all the required steps:
+* [ ] you have a `/secrets` folder with all your configuration files including
+ * [ ] a configuration file eg: `config.yaml` pointing to the correct location of other files
+ * [ ] you have a `service_account.json`
+  * [ ] (optional for telegram) an `anon.session` file, which appears after the 1st run and avoids logging into telegram again
+ * [ ] (optional for VK) a `vk_config.v2.json`
+ * [ ] (optional for using GoogleDrive storage) `gd-token.json`
+  * [ ] (optional for instagram) `instaloader.session` file which appears after the 1st run and login in instagram
+ * [ ] (optional for browsertrix) `profile.tar.gz` file
+
## Setup
+### Always required
+1. [A Google Service account is necessary for use with `gspread`.](https://gspread.readthedocs.io/en/latest/oauth2.html#for-bots-using-service-account) Credentials for this account should be stored in `service_account.json`, in the same directory as the script.
+2. A configuration file, see [Configuration file](#configuration-file).
-Check this [tutorial video](https://youtu.be/VfAhcuV2tLQ).
+### With docker image
+[Docker](https://www.docker.com/) is like a virtual machine program that isolates all the installation dependencies needed for the auto-archiver and it should be the only thing you need to install.
+
+### Without docker
+Check this [tutorial video](https://youtu.be/VfAhcuV2tLQ) for setup without the docker image.
If you are using `pipenv` (recommended), `pipenv install` is sufficient to install Python prerequisites.
-You also need:
+You need to install the following requirements on your machine:
1. [A Google Service account is necessary for use with `gspread`.](https://gspread.readthedocs.io/en/latest/oauth2.html#for-bots-using-service-account) Credentials for this account should be stored in `service_account.json`, in the same directory as the script.
2. [ffmpeg](https://www.ffmpeg.org/) must also be installed locally for this tool to work.
3. [firefox](https://www.mozilla.org/en-US/firefox/new/) and [geckodriver](https://github.com/mozilla/geckodriver/releases) on a path folder like `/usr/local/bin`.
@@ -22,7 +47,7 @@ You also need:
1. To improve the websites browsertrix can archive you can also create a custom profile by running `docker run -p 9222:9222 -p 9223:9223 -v $PWD/browsertrix/crawls/profiles:/crawls/profiles/ -it webrecorder/browsertrix-crawler create-login-profile --interactive --url "https://youtube.com"`, going to [http://localhost:9223/](http://localhost:9223/) and accepting the cookies prompt on youtube, and then navigating to other websites and logging in as per your needs, so as to access more publicly blocked content, and then specifying the created `profile.tar.gz` in your config file under `execution.browsertrix.profile`.
### Configuration file
-Configuration is done via a config.yaml file (see [example.config.yaml](example.config.yaml)) and some properties of that file can be overwritten via command line arguments. Here is the current result from running the `python auto_archive.py --help`:
+Configuration is done via a config.yaml file (see [example.config.yaml](example.config.yaml)) and some properties of that file can be overwritten via command line arguments. Make a copy of that file and rename it to your liking eg. `config-test.yaml` . Here is the current result from running the `python auto_archive.py --help`:
python auto_archive.py --help
@@ -151,6 +176,15 @@ To make it easier to set up new auto-archiver sheets, the auto-auto-archiver wil

+# Docker development
+* working with docker locally:
+ * `docker build . -t auto-archiver` to build a local image
+  * `docker run --rm -v $PWD/secrets:/app/secrets auto-archiver --config secrets/config.yaml`
+ * to use local archive, also create a volume `-v` for it by adding `-v $PWD/local_archive:/app/local_archive`
+* release to docker hub
+ * `docker image tag auto-archiver bellingcat/auto-archiver:latest`
+  * `docker push bellingcat/auto-archiver` (validate [here](https://hub.docker.com/r/bellingcat/auto-archiver))
+
# Code structure
Code is split into functional concepts:
1. [Archivers](archivers/) - receive a URL that they try to archive
diff --git a/orchestrate.yaml b/orchestrate.yaml
index 9a4ec42..689765f 100644
--- a/orchestrate.yaml
+++ b/orchestrate.yaml
@@ -8,14 +8,14 @@ steps:
- twitter
- instagram
- webarchive # this way it runs as a failsafe only
- enrichments:
+ enrichers:
- screenshot
- wacz
- webarchive # this way it runs for every case, webarchive extends archiver and enrichment
- thumbnails
formatters:
- HTMLFormater
- - PDFFormater
+ - PdfFormater
storages:
- local_storage
- s3
@@ -26,6 +26,8 @@ steps:
configurations:
+ global:
+ - save_logs: False
gsheets_feeder:
- sheet: "Auto archiver"
- header: "" # defaults to 1 in GSheetsFeeder
@@ -36,7 +38,7 @@ configurations:
token: "here"
screenshot:
width: 1280
- height: 720
+ height: 720000
wacz:
profile: secrets/profile.tar.gz
webarchive:
diff --git a/src/configs/v2config.py b/src/configs/v2config.py
new file mode 100644
index 0000000..4b072c3
--- /dev/null
+++ b/src/configs/v2config.py
@@ -0,0 +1,80 @@
+
+
+import argparse, yaml
+from dataclasses import dataclass, field
+from typing import List
+from step import Step
+from utils import Util
+from enrichers import Enricher
+from collections import defaultdict
+
+
+@dataclass
+class ConfigV2:
+    # TODO: should Config inherit from Step so it can have its own configurations?
+    configurable_parents = [
+        Enricher,
+        Util
+    ]
+    feeder: Step  # TODO:= BaseFeeder
+    archivers: List[Step] = field(default_factory=list)  # TODO: fix type
+    enrichers: List[Enricher] = field(default_factory=list)  # default_factory must be a zero-argument callable, not a list instance
+    formatters: List[Step] = field(default_factory=list)  # TODO: fix type
+    storages: List[Step] = field(default_factory=list)  # TODO: fix type
+    databases: List[Step] = field(default_factory=list)  # TODO: fix type
+
+    def __init__(self) -> None:
+        self.defaults = {}
+        self.config = {}
+
+    def parse(self):
+        # 1. parse CLI values
+        parser = argparse.ArgumentParser(
+            # prog = "auto-archiver",
+            description="Auto Archiver is a ...!",
+            epilog="Check the code at https://github.com/bellingcat/auto-archiver"
+        )
+
+        parser.add_argument('--config', action='store', dest='config', help='the filename of the YAML configuration file (defaults to \'config.yaml\')', default='config.yaml')
+
+        for configurable in self.configurable_parents:
+            child: Step
+            for child in configurable.__subclasses__():
+                for config, details in child.configs().items():
+                    assert "." not in child.name, f"class prop name cannot contain dots('.'): {child.name}"
+                    assert "." not in config, f"config property cannot contain dots('.'): {config}"
+                    config_path = f"{child.name}.{config}"  # "<step name>.<option>", split again on "." in step 3
+                    parser.add_argument(f'--{config_path}', action='store', dest=config_path, help=details['help'])
+                    self.defaults[config_path] = details["default"]
+
+        args = parser.parse_args()
+
+        # 2. read YAML config file
+        with open(args.config, "r", encoding="utf-8") as inf:
+            self.yaml_config = yaml.safe_load(inf)
+
+        # 3. CONFIGS: decide value with priority: CLI >> config.yaml >> default
+        self.config = defaultdict(dict)
+        for config_path, default in self.defaults.items():
+            child, config = tuple(config_path.split("."))
+            val = getattr(args, config_path)  # None when the option was not passed on the command line
+            if val is None:
+                val = self.yaml_config.get("configurations", {}).get(child, {}).get(config, default)
+            self.config[child][config] = val
+        self.config = dict(self.config)
+
+        # 4. STEPS: read steps and validate they exist
+        steps = self.yaml_config.get("steps", {})
+        assert "archivers" in steps, "your configuration steps are missing the archivers property"
+        assert "storages" in steps, "your configuration steps are missing the storages property"
+
+        print(self.config)
+
+        # self.feeder = Feeder.init
+        self.enrichers = [Enricher.init(steps.get("enrichers", [])[0], self.config)]
+
+
+        print(self.enrichers)
+
+    def validate(self):
+        pass
diff --git a/src/enrichers/__init__.py b/src/enrichers/__init__.py
new file mode 100644
index 0000000..3c266f8
--- /dev/null
+++ b/src/enrichers/__init__.py
@@ -0,0 +1,2 @@
+from .enricher import Enricher
+from .enricher_screenshot import ScreenshotEnricher
\ No newline at end of file
diff --git a/src/enrichers/enricher.py b/src/enrichers/enricher.py
new file mode 100644
index 0000000..c767b8e
--- /dev/null
+++ b/src/enrichers/enricher.py
@@ -0,0 +1,20 @@
+from __future__ import annotations
+from dataclasses import dataclass
+from abc import abstractmethod, ABC
+from metadata import Metadata
+from step import Step
+
+@dataclass
+class Enricher(Step, ABC):  # abstract base class for enricher steps; subclasses implement enrich()
+    name = "enricher"
+
+    def __init__(self, config: dict) -> None:
+        Step.__init__(self)  # NOTE: `config` is accepted but not used here
+
+    # only for typing... delegates to Step.init and annotates the result as an Enricher
+    @staticmethod
+    def init(name: str, config: dict) -> Enricher:
+        return Step.init(name, config, Enricher)
+
+    @abstractmethod
+    def enrich(self, item: Metadata) -> Metadata: pass
diff --git a/src/enrichers/enricher_screenshot.py b/src/enrichers/enricher_screenshot.py
new file mode 100644
index 0000000..04a2bf0
--- /dev/null
+++ b/src/enrichers/enricher_screenshot.py
@@ -0,0 +1,53 @@
+from . import Enricher
+from metadata import Metadata
+from loguru import logger
+
+
+class ScreenshotEnricher(Enricher):
+ name = "screenshot"
+
+ @staticmethod
+ def configs() -> dict:
+ return {
+ "width": {"default": 1280, "help": "width of the screenshots"},
+ "height": {"default": 720, "help": "height of the screenshots"},
+ }
+
+ def enrich(self, item: Metadata) -> Metadata:
+ url = self.get_url(item)
+ print("enrich")
+ # driver = config.webdriver
+ # with driver as Webdriver(): # TODO: make a util
+ # #TODO: take screenshot
+ # pass
+
+ # logger.debug(f"getting screenshot for {url=}")
+ # key = self._get_key_from_url(url, ".png", append_datetime=True)
+ # filename = os.path.join(Storage.TMP_FOLDER, key)
+
+ # # Accept cookies popup dismiss for ytdlp video
+ # if 'facebook.com' in url:
+ # try:
+ # logger.debug(f'Trying fb click accept cookie popup for {url}')
+ # self.driver.get("http://www.facebook.com")
+ # foo = self.driver.find_element(By.XPATH, "//button[@data-cookiebanner='accept_only_essential_button']")
+ # foo.click()
+ # logger.debug(f'fb click worked')
+ # # linux server needs a sleep otherwise facebook cookie won't have worked and we'll get a popup on next page
+ # time.sleep(2)
+ # except:
+ # logger.warning(f'Failed on fb accept cookies for url {url}')
+
+ # try:
+ # self.driver.get(url)
+ # time.sleep(6)
+ # except TimeoutException:
+ # logger.info("TimeoutException loading page for screenshot")
+
+ # self.driver.save_screenshot(filename)
+ # self.storage.upload(filename, key, extra_args={'ACL': 'public-read', 'ContentType': 'image/png'})
+
+ # cdn_url = self.storage.get_cdn_url(key)
+ # self.add_to_media(cdn_url, key)
+
+ # return cdn_url
diff --git a/src/metadata.py b/src/metadata.py
new file mode 100644
index 0000000..39b62ff
--- /dev/null
+++ b/src/metadata.py
@@ -0,0 +1,30 @@
+
+from __future__ import annotations
+from typing import Union, Dict
+from dataclasses import dataclass
+
+
+@dataclass
+class Metadata:
+    # Container handed to and returned by the steps; it holds primitives only, never files.
+    # The only pieces of logic that handle files are the archiver, enricher, and storage.
+    status: str
+    # title: str
+    # url: str
+    # hash: str
+    metadata: Dict[str, Metadata]  # str-keyed nested Metadata entries
+
+    @staticmethod
+    def merge(left: Metadata, right: Metadata, overwrite_left: bool = True) -> Metadata:
+        # TODO (stub, currently returns None): should return a merged version of both Metadata;
+        # intended to work for the results of archived() and enriched().
+        # If both contain the same key only one value can remain: overwrite_left decides which.
+        pass
+
+    def get(self, key: str) -> Union[Metadata, str]:
+        # TODO (stub, currently returns None): should go through metadata and return the entry available for key.
+        pass
+
+    def as_json(self) -> str:
+        # TODO (stub, currently returns None): should convert all metadata and data into JSON.
+        pass
diff --git a/src/orchestrator.py b/src/orchestrator.py
index 30f7b3c..272919f 100644
--- a/src/orchestrator.py
+++ b/src/orchestrator.py
@@ -1,5 +1,5 @@
-from typing import Union, Dict
from __future__ import annotations
+from typing import Union, Dict
from dataclasses import dataclass
"""
@@ -39,31 +39,31 @@ Cisticola considerations:
2. So the auto-archiver becomes like a puzzle and fixes to Cisticola scrapers can immediately benefit it, and contributions are focused on a single source or scraping
"""
-@dataclass
-class Metadata:
- # does not handle files, only primitives
- # the only piece of logic to handle files is the archiver, enricher, and storage
- status: str
- # title: str
- # url: str
- # hash: str
- main_file: Metadata
- metadata: Dict[str, Metadata]
+# @dataclass
+# class Metadata:
+# # does not handle files, only primitives
+# # the only piece of logic to handle files is the archiver, enricher, and storage
+# status: str
+# # title: str
+# # url: str
+# # hash: str
+# main_file: Metadata
+# metadata: Dict[str, Metadata]
- @staticmethod
- def merge(left, right : Metadata, overwrite_left=True) -> Metadata:
- # should return a merged version of the Metadata
- # will work for archived() and enriched()
- # what if 2 metadatas contain the same keys? only one can remain! : overwrite_left
- pass
+# @staticmethod
+# def merge(left, right : Metadata, overwrite_left=True) -> Metadata:
+# # should return a merged version of the Metadata
+# # will work for archived() and enriched()
+# # what if 2 metadatas contain the same keys? only one can remain! : overwrite_left
+# pass
- def get(self, key) -> Union[Metadata, str]:
- # goes through metadata and returns the Metadata available
- pass
+# def get(self, key) -> Union[Metadata, str]:
+# # goes through metadata and returns the Metadata available
+# pass
- def as_json(self) -> str:
- # converts all metadata and data into JSON
- pass
+# def as_json(self) -> str:
+# # converts all metadata and data into JSON
+# pass
"""
@@ -116,27 +116,27 @@ class ArchivingOrchestrator:
# where does that update/processing happen? in config.py
# reflection for Archiver to know wihch child classes it has? use Archiver.__subclasses__
self.archivers = [
- Archiver.init(a, config.get(a))
+ Archiver.init(a, config)
for a in config.archivers
]
- self.enrichments = [
- Enrichment.init(e, config.get(e))
- for e in config.enrichments
+ self.enrichers = [
+ Enricher.init(e, config)
+ for e in config.enrichers
]
self.formatters = [
- Formatter.init(f, config.get(f))
+ Formatter.init(f, config)
for f in config.formatters
]
self.storages = [
- Storage.init(s, config.get(s))
+ Storage.init(s, config)
for s in config.storages
]
self.databases = [
- Database.init(f, config.get(f))
+ Database.init(f, config)
for f in config.formatters
]
@@ -192,11 +192,11 @@ class ArchivingOrchestrator:
# what if an archiver returns multiple entries and one is to be part of HTMLgenerator?
# should it call the HTMLgenerator as if it's not an enrichment?
- # eg: if it is enable: generates an HTML with all the returned media, should it include enrichments? yes
+ # eg: if it is enable: generates an HTML with all the returned media, should it include enrichers? yes
# then how to execute it last? should there also be post-processors? are there other examples?
# maybe as a PDF? or a Markdown file
# side captures: screenshot, wacz, webarchive, thumbnails, HTMLgenerator
- for e in enrichments:
+ for e in enrichers:
result.update(e.enrich(result))
# formatters, enrichers, and storages will sometimes look for specific properties: eg
Screenshot:
diff --git a/src/step.py b/src/step.py
new file mode 100644
index 0000000..d717386
--- /dev/null
+++ b/src/step.py
@@ -0,0 +1,64 @@
+from __future__ import annotations
+from dataclasses import dataclass
+from typing import Type
+from metadata import Metadata
+from abc import ABC
+
+
+@dataclass
+class Step(ABC):
+    """Base class of the configurable steps.
+
+    Subclasses set ``name``: it selects the step's section of the
+    configuration and the class instantiated by ``init``.
+    """
+    name: str = None
+
+    def __init__(self, config: dict) -> None:
+        """Keep the configuration section that belongs to this step.
+
+        Args:
+            config: mapping of step name to that step's configuration values.
+
+        Raises:
+            KeyError: if ``config`` has no entry for ``self.name``.
+        """
+        # read from the argument: self.config does not exist before this line
+        self.config = config[self.name]
+
+    @staticmethod
+    def configs() -> dict:
+        """Return the configurable properties of the step (none by default)."""
+        return {}
+
+    @staticmethod
+    def init(name: str, config: dict, child: Type[Step]) -> Step:
+        """Instantiate the direct subclass of ``child`` whose ``name`` matches.
+
+        Only ``child.__subclasses__()`` is searched, so subclasses of those
+        subclasses are not found.
+
+        Args:
+            name: ``name`` attribute of the wanted step class.
+            config: full configuration mapping, passed to the constructor.
+            child: parent class whose direct subclasses are the candidates.
+
+        Raises:
+            ValueError: if no direct subclass of ``child`` is named ``name``.
+        """
+        for sub in child.__subclasses__():
+            if sub.name == name:
+                # instantiate the class: sub.__init__(config) has no instance to bind to
+                return sub(config)
+        # a plain string cannot be raised, exceptions must derive from BaseException
+        raise ValueError(f"Unable to initialize class with {name=}")
+
+    def get_url(self, item: Metadata) -> str:
+        """Return the non-empty ``url`` string stored in ``item``.
+
+        Raises:
+            AssertionError: if the url is not a ``str`` or is empty.
+        """
+        url = item.get("url")
+        assert type(url) is str and len(url) > 0
+        return url
diff --git a/src/utils/__init__.py b/src/utils/__init__.py
index 68010ab..baea5e9 100644
--- a/src/utils/__init__.py
+++ b/src/utils/__init__.py
@@ -1,3 +1,4 @@
# we need to explicitly expose the available imports here
from .gworksheet import *
-from .misc import *
\ No newline at end of file
+from .misc import *
+from .util import Util
\ No newline at end of file
diff --git a/src/utils/util.py b/src/utils/util.py
new file mode 100644
index 0000000..9ad5b53
--- /dev/null
+++ b/src/utils/util.py
@@ -0,0 +1,27 @@
+from __future__ import annotations
+from dataclasses import dataclass
+from abc import abstractmethod, ABC
+from metadata import Metadata
+from step import Step
+
+@dataclass
+class Util(Step, ABC):
+    """Abstract base class of the utility steps."""
+    name = "util"
+
+    def __init__(self, config: dict) -> None:
+        """Initialise the step from the full configuration mapping."""
+        # Step.__init__ requires the configuration, so pass it along
+        super().__init__(config)
+
+    # only for typing...
+    @staticmethod
+    def init(name: str, config: dict) -> Util:
+        """Instantiate the direct ``Util`` subclass whose ``name`` matches."""
+        # zero-argument super() needs an instance or class as first argument,
+        # which this function does not receive: call Step.init explicitly
+        return Step.init(name, config, Util)
+
+    @abstractmethod
+    def enrich(self, item: Metadata) -> Metadata:
+        """Hook implemented by subclasses; takes and returns a ``Metadata``."""
diff --git a/src/v2.py b/src/v2.py
new file mode 100644
index 0000000..8fa544f
--- /dev/null
+++ b/src/v2.py
@@ -0,0 +1,9 @@
+
+
+from configs.v2config import ConfigV2
+from orchestrator import ArchivingOrchestrator
+
+config = ConfigV2()
+config.parse()  # fills `config` in place; the orchestrator is not started yet (its construction is still commented out)
+
+# orchestrator = ArchivingOrchestrator(config)
\ No newline at end of file
From 618e7ed0a3a70446ce9e3c99f7f7359c26bf057d Mon Sep 17 00:00:00 2001
From: msramalho <19508417+msramalho@users.noreply.github.com>
Date: Thu, 24 Nov 2022 11:53:21 +0000
Subject: [PATCH 022/190] subproperties in config
---
src/configs/v2config.py | 76 ++++++++++++++++++++++++++++++-----------
1 file changed, 56 insertions(+), 20 deletions(-)
diff --git a/src/configs/v2config.py b/src/configs/v2config.py
index 4b072c3..bce5669 100644
--- a/src/configs/v2config.py
+++ b/src/configs/v2config.py
@@ -3,6 +3,7 @@
import argparse, yaml
from dataclasses import dataclass, field
from typing import List
+from feeders.feeder import Feeder
from step import Step
from utils import Util
from enrichers import Enricher
@@ -13,15 +14,16 @@ from collections import defaultdict
class ConfigV2:
# TODO: should Config inherit from Step so it can have it's own configurations?
configurable_parents = [
+ Feeder,
Enricher,
- Util
+ # Util
]
- feeder : Step #TODO:= BaseFeeder
- archivers: List[Step] = field(default_factory=[]) #TODO: fix type
+ feeder: Step # TODO:= BaseFeeder
+ archivers: List[Step] = field(default_factory=[]) # TODO: fix type
enrichers: List[Enricher] = field(default_factory=[])
- formatters: List[Step] = field(default_factory=[]) #TODO: fix type
- storages: List[Step] = field(default_factory=[]) #TODO: fix type
- databases: List[Step] = field(default_factory=[]) #TODO: fix type
+ formatters: List[Step] = field(default_factory=[]) # TODO: fix type
+ storages: List[Step] = field(default_factory=[]) # TODO: fix type
+ databases: List[Step] = field(default_factory=[]) # TODO: fix type
def __init__(self) -> None:
self.defaults = {}
@@ -39,13 +41,27 @@ class ConfigV2:
for configurable in self.configurable_parents:
child: Step
+ # print(f"{configurable=}")
for child in configurable.__subclasses__():
+ # print(f"{child=} {child.configs()=}")
+
for config, details in child.configs().items():
+ print(config, details)
assert "." not in child.name, f"class prop name cannot contain dots('.'): {child.name}"
assert "." not in config, f"config property cannot contain dots('.'): {config}"
+ if (is_nested := type(details["default"]) == dict):
+ for subconfig, subdefault in details["default"].items():
+ assert "." not in subconfig, f"config subproperty cannot contain dots('.'): {subconfig}"
+ config_path = f"{child.name}.{config}.{subconfig}"
+ parser.add_argument(f'--{config_path}', action='store', dest=config_path, help=details['help'] + f"({subconfig})")
+ self.defaults[config_path] = subdefault
+
config_path = f"{child.name}.{config}"
- parser.add_argument(f'--{config_path}', action='store', dest=config_path, help=details['help'])
+ print(config_path)
self.defaults[config_path] = details["default"]
+ if not is_nested:
+ # nested cannot be directly set on the CLI
+ parser.add_argument(f'--{config_path}', action='store', dest=config_path, help=details['help'])
args = parser.parse_args()
@@ -53,28 +69,48 @@ class ConfigV2:
with open(args.config, "r", encoding="utf-8") as inf:
self.yaml_config = yaml.safe_load(inf)
+ # print(f"{self.yaml_config.get('configurations', {})=}")
# 3. CONFIGS: decide value with priority: CLI >> config.yaml >> default
self.config = defaultdict(dict)
for config_path, default in self.defaults.items():
- child, config = tuple(config_path.split("."))
- val = getattr(args, config_path)
- if val is None:
- val = self.yaml_config.get("configurations", {}).get(child, {}).get(config, default)
- self.config[child][config] = val
+ config_steps = config_path.split(".")
+ if len(config_steps) == 2: # not nested
+ child, config = tuple(config_steps)
+ val = getattr(args, config_path, None)
+ if val is None:
+ val = self.yaml_config.get("configurations", {}).get(child, {}).get(config, default)
+ # self.config[child][config] = val
+
+ elif len(config_steps) == 3: # nested
+ child, config, subconfig = tuple(config_steps)
+ val = getattr(args, config_path)
+ if config not in self.config[child]:
+ self.config[child][config] = {}
+ if val is None:
+ val = self.yaml_config.get("configurations", {}).get(child, {}).get(config, {}).get(subconfig, default)
+ print(child, config, subconfig, val)
+ self.config[child][config][subconfig] = val
+
+ # child, config = tuple(config_path.split("."))
+ # # print(config_path)
+ # val = getattr(args, config_path)
+ # # print(child, config, val)
+ # if val is None:
+ # val = self.yaml_config.get("configurations", {}).get(child, {}).get(config, default)
+ # self.config[child][config] = val
self.config = dict(self.config)
# 4. STEPS: read steps and validate they exist
steps = self.yaml_config.get("steps", {})
assert "archivers" in steps, "your configuration steps are missing the archivers property"
assert "storages" in steps, "your configuration steps are missing the storages property"
-
- print(self.config)
-
- # self.feeder = Feeder.init
- self.enrichers = [Enricher.init(steps.get("enrichers", [])[0], self.config)]
-
-
- print(self.enrichers)
+
+ print("config.py", self.config)
+
+ self.feeder = Feeder.init(steps.get("feeder", "cli_feeder"), self.config)
+ self.enrichers = [Enricher.init(e, self.config) for e in steps.get("enrichers", [])]
+
+ print("enrichers", [e for e in self.enrichers])
def validate(self):
pass
From 9dc709d3b90603c205b081847362f19027d123f4 Mon Sep 17 00:00:00 2001
From: msramalho <19508417+msramalho@users.noreply.github.com>
Date: Thu, 24 Nov 2022 15:44:25 +0000
Subject: [PATCH 023/190] demo feeder logic working
---
orchestrate.yaml | 34 ++++++++--
src/archivers/telethon_archiver.py | 2 +-
src/configs/v2config.py | 56 +++++-----------
src/enrichers/enricher.py | 3 +-
src/feeders/__init__.py | 2 +
src/feeders/feeder.py | 23 +++++++
src/feeders/feeder_gsheet.py | 101 +++++++++++++++++++++++++++++
src/orchestrator.py | 54 +++++++--------
src/step.py | 16 +++--
src/utils/__init__.py | 2 +-
src/utils/util.py | 3 +-
src/v2.py | 5 +-
12 files changed, 216 insertions(+), 85 deletions(-)
create mode 100644 src/feeders/__init__.py
create mode 100644 src/feeders/feeder.py
create mode 100644 src/feeders/feeder_gsheet.py
diff --git a/orchestrate.yaml b/orchestrate.yaml
index 689765f..3a2bc27 100644
--- a/orchestrate.yaml
+++ b/orchestrate.yaml
@@ -10,9 +10,9 @@ steps:
- webarchive # this way it runs as a failsafe only
enrichers:
- screenshot
- - wacz
- - webarchive # this way it runs for every case, webarchive extends archiver and enrichment
- - thumbnails
+ # - wacz
+ # - webarchive # this way it runs for every case, webarchive extends archiver and enrichment
+ # - thumbnails
formatters:
- HTMLFormater
- PdfFormater
@@ -29,10 +29,32 @@ configurations:
global:
- save_logs: False
gsheets_feeder:
- - sheet: "Auto archiver"
- - header: "" # defaults to 1 in GSheetsFeeder
- - service_account: "secrets/service_account.json"
+ sheet: auto-archiver-test
+ header: 2 # defaults to 1 in GSheetsFeeder
+ service_account: "secrets/service_account.json"
+ allow_worksheets: "aa-refactor-tests"
+ block_worksheets: "blocked,test-cases-008"
+ columns:
+ 'url': 'link'
+ 'status': 'archive status'
+ 'folder': 'destination folder'
+ 'archive': 'archive location'
+ 'date': 'archive date'
+ 'thumbnail': 'thumbnail'
+ 'thumbnail_index': 'thumbnail index'
+ 'timestamp': 'upload timestamp'
+ 'title': 'upload title'
+ 'duration': 'duration'
+ 'screenshot': 'screenshot'
+ 'hash': 'hash'
+ 'wacz': 'wacz'
+ 'replaywebpage': 'replaywebpage'
tiktok:
+ api_keys:
+ - username: 1
+ password: 2
+ - username: 3
+ password: 4
username: "abc"
password: "123"
token: "here"
diff --git a/src/archivers/telethon_archiver.py b/src/archivers/telethon_archiver.py
index f0ff194..a2cbf0a 100644
--- a/src/archivers/telethon_archiver.py
+++ b/src/archivers/telethon_archiver.py
@@ -17,7 +17,7 @@ class TelethonArchiver(Archiver):
super().__init__(storage, config)
if config.telegram_config:
c = config.telegram_config
- self.client = TelegramClient("./anon", c.api_id, c.api_hash)
+ self.client = TelegramClient("./anon.session", c.api_id, c.api_hash)
self.bot_token = c.bot_token
def _get_media_posts_in_group(self, chat, original_post, max_amp=10):
diff --git a/src/configs/v2config.py b/src/configs/v2config.py
index bce5669..9eb35df 100644
--- a/src/configs/v2config.py
+++ b/src/configs/v2config.py
@@ -27,8 +27,10 @@ class ConfigV2:
def __init__(self) -> None:
self.defaults = {}
+ self.cli_ops = {}
self.config = {}
+ # TODO: make this work for nested props like gsheets_feeder.columns.url = "URL"
def parse(self):
# 1. parse CLI values
parser = argparse.ArgumentParser(
@@ -41,27 +43,15 @@ class ConfigV2:
for configurable in self.configurable_parents:
child: Step
- # print(f"{configurable=}")
for child in configurable.__subclasses__():
- # print(f"{child=} {child.configs()=}")
-
for config, details in child.configs().items():
- print(config, details)
assert "." not in child.name, f"class prop name cannot contain dots('.'): {child.name}"
assert "." not in config, f"config property cannot contain dots('.'): {config}"
- if (is_nested := type(details["default"]) == dict):
- for subconfig, subdefault in details["default"].items():
- assert "." not in subconfig, f"config subproperty cannot contain dots('.'): {subconfig}"
- config_path = f"{child.name}.{config}.{subconfig}"
- parser.add_argument(f'--{config_path}', action='store', dest=config_path, help=details['help'] + f"({subconfig})")
- self.defaults[config_path] = subdefault
-
config_path = f"{child.name}.{config}"
- print(config_path)
+ parser.add_argument(f'--{config_path}', action='store', dest=config_path, help=details['help'])
self.defaults[config_path] = details["default"]
- if not is_nested:
- # nested cannot be directly set on the CLI
- parser.add_argument(f'--{config_path}', action='store', dest=config_path, help=details['help'])
+ if "cli_set" in details:
+ self.cli_ops[config_path] = details["cli_set"]
args = parser.parse_args()
@@ -73,31 +63,14 @@ class ConfigV2:
# 3. CONFIGS: decide value with priority: CLI >> config.yaml >> default
self.config = defaultdict(dict)
for config_path, default in self.defaults.items():
- config_steps = config_path.split(".")
- if len(config_steps) == 2: # not nested
- child, config = tuple(config_steps)
- val = getattr(args, config_path, None)
- if val is None:
- val = self.yaml_config.get("configurations", {}).get(child, {}).get(config, default)
- # self.config[child][config] = val
-
- elif len(config_steps) == 3: # nested
- child, config, subconfig = tuple(config_steps)
- val = getattr(args, config_path)
- if config not in self.config[child]:
- self.config[child][config] = {}
- if val is None:
- val = self.yaml_config.get("configurations", {}).get(child, {}).get(config, {}).get(subconfig, default)
- print(child, config, subconfig, val)
- self.config[child][config][subconfig] = val
-
- # child, config = tuple(config_path.split("."))
- # # print(config_path)
- # val = getattr(args, config_path)
- # # print(child, config, val)
- # if val is None:
- # val = self.yaml_config.get("configurations", {}).get(child, {}).get(config, default)
- # self.config[child][config] = val
+ child, config = tuple(config_path.split("."))
+ val = getattr(args, config_path)
+ if val is not None and config_path in self.cli_ops:
+ val = self.cli_ops[config_path](val, default)
+ if val is None:
+ val = self.yaml_config.get("configurations", {}).get(child, {}).get(config, default)
+ # print(child, config, val)
+ self.config[child][config] = val
self.config = dict(self.config)
# 4. STEPS: read steps and validate they exist
@@ -105,11 +78,12 @@ class ConfigV2:
assert "archivers" in steps, "your configuration steps are missing the archivers property"
assert "storages" in steps, "your configuration steps are missing the storages property"
- print("config.py", self.config)
+ # print("config.py", self.config)
self.feeder = Feeder.init(steps.get("feeder", "cli_feeder"), self.config)
self.enrichers = [Enricher.init(e, self.config) for e in steps.get("enrichers", [])]
+ print("feeder", self.feeder)
print("enrichers", [e for e in self.enrichers])
def validate(self):
diff --git a/src/enrichers/enricher.py b/src/enrichers/enricher.py
index c767b8e..baa22e3 100644
--- a/src/enrichers/enricher.py
+++ b/src/enrichers/enricher.py
@@ -9,7 +9,8 @@ class Enricher(Step, ABC):
name = "enricher"
def __init__(self, config: dict) -> None:
- Step.__init__(self)
+ # without this STEP.__init__ is not called
+ super().__init__(config)
# only for typing...
diff --git a/src/feeders/__init__.py b/src/feeders/__init__.py
new file mode 100644
index 0000000..9fb5942
--- /dev/null
+++ b/src/feeders/__init__.py
@@ -0,0 +1,2 @@
+from.feeder import Feeder
+from .feeder_gsheet import GsheetsFeeder
\ No newline at end of file
diff --git a/src/feeders/feeder.py b/src/feeders/feeder.py
new file mode 100644
index 0000000..6b7ba10
--- /dev/null
+++ b/src/feeders/feeder.py
@@ -0,0 +1,23 @@
+from __future__ import annotations
+from dataclasses import dataclass
+from abc import abstractmethod
+# from metadata import Metadata
+from step import Step
+
+
+@dataclass
+class Feeder(Step):
+    name = "feeder"  # identifier compared against the requested step name in Step.init
+
+    def __init__(self, config: dict) -> None:
+        # defined explicitly so that Step.__init__ runs with the given config
+        super().__init__(config)
+
+    def init(name: str, config: dict) -> Feeder:
+        # called on the class (no self); only fixes the parent to Feeder and narrows the return type
+        return Step.init(name, config, Feeder)
+
+    # def feed(self, item: Metadata) -> Metadata: pass
+
+    @abstractmethod
+    def __iter__(self) -> Feeder: return None  # abstract: concrete feeders provide the iteration
\ No newline at end of file
diff --git a/src/feeders/feeder_gsheet.py b/src/feeders/feeder_gsheet.py
new file mode 100644
index 0000000..7ebc640
--- /dev/null
+++ b/src/feeders/feeder_gsheet.py
@@ -0,0 +1,126 @@
+import json, gspread
+
+# from metadata import Metadata
+from loguru import logger
+
+# from . import Enricher
+from feeders.feeder import Feeder
+from utils import GWorksheet
+
+
+class GsheetsFeeder(Feeder):
+    """Feeder that yields the URLs found in the worksheets of a Google Sheet."""
+    name = "gsheets_feeder"
+
+    def __init__(self, config: dict) -> None:
+        """Initialise the step from ``config`` and create the gspread client.
+
+        Raises:
+            AssertionError: if the configured ``header`` is not an integer.
+        """
+        # without this STEP.__init__ is not called
+        super().__init__(config)
+        self.gsheets_client = gspread.service_account(filename=self.service_account)
+        assert isinstance(self.header, int), f"header ({self.header}) value must be an integer not {type(self.header)}"
+
+    @staticmethod
+    def configs() -> dict:
+        """Return the configurable properties with their defaults and help texts.
+
+        ``cli_set``, when present, turns the raw CLI string (first argument)
+        and the default value (second argument) into the value to use.
+        """
+        return {
+            "sheet": {"default": None, "help": "name of the sheet to archive"},
+            "header": {"default": 1, "help": "index of the header row (starts at 1)"},
+            "service_account": {"default": "secrets/service_account.json", "help": "service account JSON file path"},
+            "allow_worksheets": {
+                "default": set(),
+                "help": "(CSV) only worksheets whose name is included in allow are included (overrides worksheet_block), leave empty so all are allowed",
+                "cli_set": lambda cli_val, cur_val: set(cli_val.split(","))
+            },
+            "block_worksheets": {
+                "default": set(),
+                "help": "(CSV) explicitly block some worksheets from being processed, defaults to empty",
+                "cli_set": lambda cli_val, cur_val: set(cli_val.split(","))
+            },
+            "columns": {
+                "default": {
+                    'url': 'link',
+                    'status': 'archive status',
+                    'folder': 'destination folder',
+                    'archive': 'archive location',
+                    'date': 'archive date',
+                    'thumbnail': 'thumbnail',
+                    'thumbnail_index': 'thumbnail index',
+                    'timestamp': 'upload timestamp',
+                    'title': 'upload title',
+                    'duration': 'duration',
+                    'screenshot': 'screenshot',
+                    'hash': 'hash',
+                    'wacz': 'wacz',
+                    'replaywebpage': 'replaywebpage',
+                },
+                "help": "names of columns in the google sheet",
+                "cli_set": lambda cli_val, cur_val: dict(cur_val, **json.loads(cli_val))
+            },
+        }
+
+    def __iter__(self) -> str:
+        """Yield every non-empty URL found in the worksheets that are processed.
+
+        Worksheets rejected by the allow/block rules or missing a required
+        column are skipped and logged.
+        """
+        sh = self.gsheets_client.open(self.sheet)
+        for ii, wks in enumerate(sh.worksheets()):
+            if not self.should_process_sheet(wks.title):
+                logger.debug(f"SKIPPED worksheet '{wks.title}' due to allow/block rules")
+                continue
+
+            logger.info(f'Opening worksheet {ii=}: {wks.title=} header={self.header}')
+            gw = GWorksheet(wks, header_row=self.header, columns=self.columns)
+
+            if missing_cols := self.missing_required_columns(gw):
+                logger.warning(f"SKIPPED worksheet '{wks.title}' due to missing required column(s) for {missing_cols}")
+                continue
+
+            # data rows start right below the header row
+            for row in range(1 + self.header, gw.count_rows() + 1):
+                url = gw.get_cell(row, 'url').strip()
+                if not url:
+                    continue
+                #TODO: gsheet_db should check later if this is supposed to be archived
+                # static_status = gw.get_cell(row, 'status')
+                # status = gw.get_cell(row, 'status', fresh=static_status in ['', None] and url != '')
+                # All checks done - archival process starts here
+                yield url
+            logger.success(f'Finished worksheet {wks.title}')
+
+    @staticmethod
+    def _worksheet_names(value) -> set:
+        """Normalise an allow/block setting to a set of worksheet names.
+
+        A string is read as CSV, names are stripped and empty ones dropped;
+        any other iterable is copied and ``None`` gives an empty set.
+        """
+        if isinstance(value, str):
+            return {name.strip() for name in value.split(",") if name.strip()}
+        return set(value or ())
+
+    def should_process_sheet(self, sheet_name: str) -> bool:
+        """Tell whether ``sheet_name`` passes the allow and block rules."""
+        # compare against sets of names: `in` on a raw CSV string matches substrings
+        allowed = self._worksheet_names(self.allow_worksheets)
+        blocked = self._worksheet_names(self.block_worksheets)
+        if allowed and sheet_name not in allowed:
+            # ALLOW rules exist AND sheet name not explicitly allowed
+            return False
+        if sheet_name in blocked:
+            # BLOCK rules exist AND sheet name is blocked
+            return False
+        return True
+
+    def missing_required_columns(self, gw: GWorksheet) -> list:
+        """Return the required columns ('url', 'status') that ``gw`` lacks."""
+        return [col for col in ['url', 'status'] if not gw.col_exists(col)]
diff --git a/src/orchestrator.py b/src/orchestrator.py
index 272919f..f32f4c9 100644
--- a/src/orchestrator.py
+++ b/src/orchestrator.py
@@ -2,6 +2,8 @@ from __future__ import annotations
from typing import Union, Dict
from dataclasses import dataclass
+from enrichers.enricher import Enricher
+
"""
how not to couple the different pieces of logic
due to the use of constants for the metadata keys?
@@ -110,49 +112,47 @@ class ArchivingOrchestrator:
def __init__(self, config) -> None:
# in config.py we should test that the archivers exist and log mismatches (blocking execution)
# identify each formatter, storage, database, etc
- self.feeder = Feeder.init(config.feeder, config.get(config.feeder))
+ # self.feeder = Feeder.init(config.feeder, config.get(config.feeder))
# Is it possible to overwrite config.yaml values? it could be useful: share config file and modify gsheets_feeder.sheet via CLI
# where does that update/processing happen? in config.py
# reflection for Archiver to know wihch child classes it has? use Archiver.__subclasses__
- self.archivers = [
- Archiver.init(a, config)
- for a in config.archivers
- ]
+ # self.archivers = [
+ # Archiver.init(a, config)
+ # for a in config.archivers
+ # ]
+ self.feeder = config.feeder
+ self.enrichers = config.enrichers
- self.enrichers = [
- Enricher.init(e, config)
- for e in config.enrichers
- ]
+ # self.formatters = [
+ # Formatter.init(f, config)
+ # for f in config.formatters
+ # ]
- self.formatters = [
- Formatter.init(f, config)
- for f in config.formatters
- ]
+ # self.storages = [
+ # Storage.init(s, config)
+ # for s in config.storages
+ # ]
- self.storages = [
- Storage.init(s, config)
- for s in config.storages
- ]
-
- self.databases = [
- Database.init(f, config)
- for f in config.formatters
- ]
+ # self.databases = [
+ # Database.init(f, config)
+ # for f in config.formatters
+ # ]
# these rules are checked in config.py
- assert len(archivers) > 1, "there needs to be at least one Archiver"
+ # assert len(archivers) > 1, "there needs to be at least one Archiver"
- def feed(self, feeder: Feeder) -> list(ArchiveResult):
- for next in feeder:
- self.archive(next)
+ def feed(self) -> list(ArchiveResult):
+ for url in self.feeder:
+ print("ARCHIVING", url)
+ # self.archive(url)
# how does this handle the parameters like folder which can be different for each archiver?
# the storage needs to know where to archive!!
# solution: feeders have context: extra metadata that they can read or ignore,
# all of it should have sensible defaults (eg: folder)
# default feeder is a list with 1 element
- def archive(url) -> Union[ArchiveResult, None]:
+ def archive(self, url) -> Union[ArchiveResult, None]:
url = clear_url(url)
result = Metadata(url=url)
diff --git a/src/step.py b/src/step.py
index d717386..04d7a61 100644
--- a/src/step.py
+++ b/src/step.py
@@ -1,16 +1,21 @@
from __future__ import annotations
-from dataclasses import dataclass
+from dataclasses import dataclass, field
+from inspect import ClassFoundException
from typing import Type
from metadata import Metadata
from abc import ABC
+# from collections.abc import Iterable
@dataclass
class Step(ABC):
- name : str = None
+ name: str = None
def __init__(self, config: dict) -> None:
- self.config = self.config[self.name]
+ # reads the configs into object properties
+ # self.config = config[self.name]
+ for k, v in config[self.name].items():
+ self.__setattr__(k, v)
@staticmethod
def configs() -> dict: {}
@@ -21,8 +26,9 @@ class Step(ABC):
"""
for sub in child.__subclasses__():
if sub.name == name:
- return sub.__init__(config)
- raise f"Unable to initialize class with {name=}"
+ print(sub.name, "CALLING NEW")
+ return sub(config)
+ raise ClassFoundException(f"Unable to initialize STEP with {name=}")
def get_url(self, item: Metadata) -> str:
url = item.get("url")
diff --git a/src/utils/__init__.py b/src/utils/__init__.py
index baea5e9..ad56f36 100644
--- a/src/utils/__init__.py
+++ b/src/utils/__init__.py
@@ -1,4 +1,4 @@
# we need to explicitly expose the available imports here
-from .gworksheet import *
+from .gworksheet import GWorksheet
from .misc import *
from .util import Util
\ No newline at end of file
diff --git a/src/utils/util.py b/src/utils/util.py
index 9ad5b53..51bb2e3 100644
--- a/src/utils/util.py
+++ b/src/utils/util.py
@@ -11,8 +11,7 @@ class Util(Step, ABC):
def __init__(self, config: dict) -> None:
Step.__init__(self)
-
- # only for typing...
+ # only for typing...
def init(name: str, config: dict) -> Util:
return super().init(name, config, Util)
diff --git a/src/v2.py b/src/v2.py
index 8fa544f..8ecb820 100644
--- a/src/v2.py
+++ b/src/v2.py
@@ -1,9 +1,12 @@
+from abc import ABC
from configs.v2config import ConfigV2
from orchestrator import ArchivingOrchestrator
config = ConfigV2()
config.parse()
-# orchestrator = ArchivingOrchestrator(config)
\ No newline at end of file
+orchestrator = ArchivingOrchestrator(config)
+
+orchestrator.feed()
From 955891a411cb2bd96a477f3751472776995b101a Mon Sep 17 00:00:00 2001
From: msramalho <19508417+msramalho@users.noreply.github.com>
Date: Sat, 10 Dec 2022 12:03:46 +0000
Subject: [PATCH 024/190] WIP feeder
---
orchestrate.yaml | 8 +++---
src/configs/v2config.py | 2 +-
src/databases/database.py | 21 ++++++++++++++
src/enrichers/enricher.py | 2 +-
src/feeders/feeder.py | 2 +-
src/feeders/feeder_gsheet.py | 54 ++++++++++++------------------------
src/steps/gsheet.py | 42 ++++++++++++++++++++++++++++
src/{ => steps}/step.py | 0
src/utils/util.py | 2 +-
9 files changed, 88 insertions(+), 45 deletions(-)
create mode 100644 src/databases/database.py
create mode 100644 src/steps/gsheet.py
rename src/{ => steps}/step.py (100%)
diff --git a/orchestrate.yaml b/orchestrate.yaml
index 3a2bc27..9626e83 100644
--- a/orchestrate.yaml
+++ b/orchestrate.yaml
@@ -3,11 +3,11 @@ steps:
# a feeder could be in an "infinite loop" for example: gsheets_infinite feeder which holds-> this could be an easy logic addiction by modifying for each to while not feeder.done() if it becomes necessary
feeder: gsheets_feeder # default -> only expects URL from CLI
archivers: # order matters
- - tiktok
- telethon
- - twitter
- - instagram
- - webarchive # this way it runs as a failsafe only
+ # - tiktok
+ # - twitter
+ # - instagram
+ # - webarchive # this way it runs as a failsafe only
enrichers:
- screenshot
# - wacz
diff --git a/src/configs/v2config.py b/src/configs/v2config.py
index 9eb35df..50c8b0f 100644
--- a/src/configs/v2config.py
+++ b/src/configs/v2config.py
@@ -4,7 +4,7 @@ import argparse, yaml
from dataclasses import dataclass, field
from typing import List
from feeders.feeder import Feeder
-from step import Step
+from steps.step import Step
from utils import Util
from enrichers import Enricher
from collections import defaultdict
diff --git a/src/databases/database.py b/src/databases/database.py
new file mode 100644
index 0000000..15f8d0d
--- /dev/null
+++ b/src/databases/database.py
@@ -0,0 +1,21 @@
+from __future__ import annotations
+from dataclasses import dataclass
+from abc import abstractmethod, ABC
+from metadata import Metadata
+from steps.step import Step
+
+@dataclass
+class Database(Step, ABC):
+    name = "database"  # identifier compared against the requested step name in Step.init
+
+    def __init__(self, config: dict) -> None:
+        # defined explicitly so that Step.__init__ runs with the given config
+        super().__init__(config)
+
+
+    # called on the class (no self); only fixes the parent to Database and narrows the return type
+    def init(name: str, config: dict) -> Database:
+        return Step.init(name, config, Database)
+
+    @abstractmethod
+    def enrich(self, item: Metadata) -> Metadata: pass  # NOTE(review): abstract hook is named `enrich`; confirm that is the intended name for a database step
diff --git a/src/enrichers/enricher.py b/src/enrichers/enricher.py
index baa22e3..faf43d8 100644
--- a/src/enrichers/enricher.py
+++ b/src/enrichers/enricher.py
@@ -2,7 +2,7 @@ from __future__ import annotations
from dataclasses import dataclass
from abc import abstractmethod, ABC
from metadata import Metadata
-from step import Step
+from steps.step import Step
@dataclass
class Enricher(Step, ABC):
diff --git a/src/feeders/feeder.py b/src/feeders/feeder.py
index 6b7ba10..d930ba0 100644
--- a/src/feeders/feeder.py
+++ b/src/feeders/feeder.py
@@ -2,7 +2,7 @@ from __future__ import annotations
from dataclasses import dataclass
from abc import abstractmethod
# from metadata import Metadata
-from step import Step
+from steps.step import Step
@dataclass
diff --git a/src/feeders/feeder_gsheet.py b/src/feeders/feeder_gsheet.py
index 7ebc640..a99376f 100644
--- a/src/feeders/feeder_gsheet.py
+++ b/src/feeders/feeder_gsheet.py
@@ -5,10 +5,11 @@ from loguru import logger
# from . import Enricher
from feeders.feeder import Feeder
+from steps.gsheet import Gsheets
from utils import GWorksheet
-class GsheetsFeeder(Feeder):
+class GsheetsFeeder(Gsheets, Feeder):
name = "gsheets_feeder"
def __init__(self, config: dict) -> None:
@@ -19,41 +20,21 @@ class GsheetsFeeder(Feeder):
@staticmethod
def configs() -> dict:
- return {
- "sheet": {"default": None, "help": "name of the sheet to archive"},
- "header": {"default": 1, "help": "index of the header row (starts at 1)"},
- "service_account": {"default": "secrets/service_account.json", "help": "service account JSON file path"},
- "allow_worksheets": {
- "default": set(),
- "help": "(CSV) only worksheets whose name is included in allow are included (overrides worksheet_block), leave empty so all are allowed",
- "cli_set": lambda cli_val, cur_val: set(cli_val.split(","))
- },
- "block_worksheets": {
- "default": set(),
- "help": "(CSV) explicitly block some worksheets from being processed, defaults to empty",
- "cli_set": lambda cli_val, cur_val: set(cli_val.split(","))
- },
- "columns": {
- "default": {
- 'url': 'link',
- 'status': 'archive status',
- 'folder': 'destination folder',
- 'archive': 'archive location',
- 'date': 'archive date',
- 'thumbnail': 'thumbnail',
- 'thumbnail_index': 'thumbnail index',
- 'timestamp': 'upload timestamp',
- 'title': 'upload title',
- 'duration': 'duration',
- 'screenshot': 'screenshot',
- 'hash': 'hash',
- 'wacz': 'wacz',
- 'replaywebpage': 'replaywebpage',
+ return dict(
+ Gsheets.configs(),
+ ** {
+ "allow_worksheets": {
+ "default": set(),
+ "help": "(CSV) only worksheets whose name is included in allow are included (overrides worksheet_block), leave empty so all are allowed",
+ "cli_set": lambda cli_val, cur_val: set(cli_val.split(","))
},
- "help": "names of columns in the google sheet",
- "cli_set": lambda cli_val, cur_val: dict(cur_val, **json.loads(cli_val))
- },
- }
+ "block_worksheets": {
+ "default": set(),
+ "help": "(CSV) explicitly block some worksheets from being processed, defaults to empty",
+ "cli_set": lambda cli_val, cur_val: set(cli_val.split(","))
+ }
+ })
+
def __iter__(self) -> str:
sh = self.gsheets_client.open(self.sheet)
for ii, wks in enumerate(sh.worksheets()):
@@ -71,7 +52,7 @@ class GsheetsFeeder(Feeder):
for row in range(1 + self.header, gw.count_rows() + 1):
url = gw.get_cell(row, 'url').strip()
if not len(url): continue
- #TODO: gsheet_db should check later if this is supposed to be archived
+ # TODO: gsheet_db should check later if this is supposed to be archived
# static_status = gw.get_cell(row, 'status')
# status = gw.get_cell(row, 'status', fresh=static_status in ['', None] and url != '')
# All checks done - archival process starts here
@@ -83,7 +64,6 @@ class GsheetsFeeder(Feeder):
for u in ["url1", "url2"]:
yield u
-
def should_process_sheet(self, sheet_name: str) -> bool:
if len(self.allow_worksheets) and sheet_name not in self.allow_worksheets:
# ALLOW rules exist AND sheet name not explicitly allowed
diff --git a/src/steps/gsheet.py b/src/steps/gsheet.py
new file mode 100644
index 0000000..9654da4
--- /dev/null
+++ b/src/steps/gsheet.py
@@ -0,0 +1,42 @@
+import json, gspread
+
+from loguru import logger
+from steps.step import Step
+
+
+class Gsheets(Step):
+ name = "gsheets"
+
+ def __init__(self, config: dict) -> None:
+ # without this STEP.__init__ is not called
+ super().__init__(config)
+ self.gsheets_client = gspread.service_account(filename=self.service_account)
+ assert type(self.header) == int, f"header ({self.header}) value must be an integer not {type(self.header)}"
+
+ @staticmethod
+ def configs() -> dict:
+ return {
+ "sheet": {"default": None, "help": "name of the sheet to archive"},
+ "header": {"default": 1, "help": "index of the header row (starts at 1)"},
+ "service_account": {"default": "secrets/service_account.json", "help": "service account JSON file path"},
+ "columns": {
+ "default": {
+ 'url': 'link',
+ 'status': 'archive status',
+ 'folder': 'destination folder',
+ 'archive': 'archive location',
+ 'date': 'archive date',
+ 'thumbnail': 'thumbnail',
+ 'thumbnail_index': 'thumbnail index',
+ 'timestamp': 'upload timestamp',
+ 'title': 'upload title',
+ 'duration': 'duration',
+ 'screenshot': 'screenshot',
+ 'hash': 'hash',
+ 'wacz': 'wacz',
+ 'replaywebpage': 'replaywebpage',
+ },
+ "help": "names of columns in the google sheet",
+ "cli_set": lambda cli_val, cur_val: dict(cur_val, **json.loads(cli_val))
+ },
+ }
\ No newline at end of file
diff --git a/src/step.py b/src/steps/step.py
similarity index 100%
rename from src/step.py
rename to src/steps/step.py
diff --git a/src/utils/util.py b/src/utils/util.py
index 51bb2e3..714d499 100644
--- a/src/utils/util.py
+++ b/src/utils/util.py
@@ -2,7 +2,7 @@ from __future__ import annotations
from dataclasses import dataclass
from abc import abstractmethod, ABC
from metadata import Metadata
-from step import Step
+from steps.step import Step
@dataclass
class Util(Step, ABC):
From b3860cfec10f5f9924d146ca1c819d9f2e9aef3f Mon Sep 17 00:00:00 2001
From: msramalho <19508417+msramalho@users.noreply.github.com>
Date: Wed, 14 Dec 2022 14:01:39 +0000
Subject: [PATCH 025/190] telethon join channels working
---
.gitignore | 3 +-
Pipfile | 1 +
Pipfile.lock | 457 ++++++++++--------
README.md | 6 +
...estrate.yaml => orchestration.example.yaml | 22 +-
src/archivers/__init__.py | 5 +-
src/archivers/archiver.py | 26 +
src/archivers/telethon_archiverv2.py | 114 +++++
src/configs/v2config.py | 11 +-
src/enrichers/enricher_screenshot.py | 49 +-
src/feeders/feeder_gsheet.py | 4 +-
src/metadata.py | 22 +-
src/orchestrator.py | 59 ++-
src/steps/gsheet.py | 2 +-
src/utils/__init__.py | 3 +-
src/utils/util.py | 5 +-
src/utils/webdriver.py | 45 ++
17 files changed, 539 insertions(+), 295 deletions(-)
rename orchestrate.yaml => orchestration.example.yaml (80%)
create mode 100644 src/archivers/archiver.py
create mode 100644 src/archivers/telethon_archiverv2.py
create mode 100644 src/utils/webdriver.py
diff --git a/.gitignore b/.gitignore
index 59ed096..88ccd0e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -23,4 +23,5 @@ secrets/*
browsertrix/*
browsertrix-tmp/*
instaloader/*
-instaloader.session
\ No newline at end of file
+instaloader.session
+orchestration.yaml
\ No newline at end of file
diff --git a/Pipfile b/Pipfile
index aa04ea4..2095f2b 100644
--- a/Pipfile
+++ b/Pipfile
@@ -26,6 +26,7 @@ dateparser = "*"
vk-url-scraper = "*"
python-twitter-v2 = "*"
instaloader = "*"
+tqdm = "*"
[requires]
python_version = "3.9"
diff --git a/Pipfile.lock b/Pipfile.lock
index 6aac097..5bfeba7 100644
--- a/Pipfile.lock
+++ b/Pipfile.lock
@@ -1,7 +1,7 @@
{
"_meta": {
"hash": {
- "sha256": "bd987e7237c7e32d2dffb295db633f5a022ce1a718435d11d8ac303c9e37a4d3"
+ "sha256": "60b8f39d7a466e194c98a3fb6a03f74f03b108f5fac4cce8657c5ffdf6a02962"
},
"pipfile-spec": 6,
"requires": {
@@ -57,22 +57,24 @@
},
"boto3": {
"hashes": [
- "sha256:3b0fa19390895e664045713f2e47e63ad29c9f98b7bee6836dec7124953e48b8",
- "sha256:9feb98e045736f943c2099d955415cfe44133e03d8e2d7581d2e5dc74d0ed064"
+ "sha256:53badfc5f145b8a3f9117512b41bc5a64db1cce1b549061d8edba68909e63fdf",
+ "sha256:548081a0f8854bb2eea1e368ab29945478105f56989546f653c75528dcb07d88"
],
"index": "pypi",
- "version": "==1.26.1"
+ "version": "==1.26.28"
},
"botocore": {
"hashes": [
- "sha256:75c65130ffab527d0a3d948c6d87eb8eac210e079e1ff2768c66484be57bb77c",
- "sha256:e38b7cdce927cefabe45608dde61660b76458fba6624240dcdb6c4b8453d17f7"
+ "sha256:982732e7ed65cb6ed11ea3ce0e32dff2bcd465836c32376154f0802aa0a112c7",
+ "sha256:f0b8bb976e368dea20a960b47169e31fc0828feb6f0b9f59f1e5be8d08919b10"
],
"markers": "python_version >= '3.7'",
- "version": "==1.29.1"
+ "version": "==1.29.28"
},
"brotli": {
"hashes": [
+ "sha256:02177603aaca36e1fd21b091cb742bb3b305a569e2402f1ca38af471777fb019",
+ "sha256:11d3283d89af7033236fa4e73ec2cbe743d4f6a81d41bd234f24bf63dde979df",
"sha256:12effe280b8ebfd389022aa65114e30407540ccb89b177d3fbc9a4f177c4bd5d",
"sha256:160c78292e98d21e73a4cc7f76a234390e516afcd982fa17e1422f7c6a9ce9c8",
"sha256:16d528a45c2e1909c2798f27f7bf0a3feec1dc9e50948e738b961618e38b6a7b",
@@ -83,9 +85,15 @@
"sha256:26d168aac4aaec9a4394221240e8a5436b5634adc3cd1cdf637f6645cecbf181",
"sha256:29d1d350178e5225397e28ea1b7aca3648fcbab546d20e7475805437bfb0a130",
"sha256:2aad0e0baa04517741c9bb5b07586c642302e5fb3e75319cb62087bd0995ab19",
+ "sha256:3148362937217b7072cf80a2dcc007f09bb5ecb96dae4617316638194113d5be",
+ "sha256:330e3f10cd01da535c70d09c4283ba2df5fb78e915bea0a28becad6e2ac010be",
+ "sha256:336b40348269f9b91268378de5ff44dc6fbaa2268194f85177b53463d313842a",
"sha256:3496fc835370da351d37cada4cf744039616a6db7d13c430035e901443a34daa",
"sha256:35a3edbe18e876e596553c4007a087f8bcfd538f19bc116917b3c7522fca0429",
"sha256:3b78a24b5fd13c03ee2b7b86290ed20efdc95da75a3557cc06811764d5ad1126",
+ "sha256:3b8b09a16a1950b9ef495a0f8b9d0a87599a9d1f179e2d4ac014b2ec831f87e7",
+ "sha256:3c1306004d49b84bd0c4f90457c6f57ad109f5cc6067a9664e12b7b79a9948ad",
+ "sha256:3ffaadcaeafe9d30a7e4e1e97ad727e4f5610b9fa2f7551998471e3736738679",
"sha256:40d15c79f42e0a2c72892bf407979febd9cf91f36f495ffb333d1d04cebb34e4",
"sha256:44bb8ff420c1d19d91d79d8c3574b8954288bdff0273bf788954064d260d7ab0",
"sha256:4688c1e42968ba52e57d8670ad2306fe92e0169c6f3af0089be75bbac0c64a3b",
@@ -95,6 +103,7 @@
"sha256:56d027eace784738457437df7331965473f2c0da2c70e1a1f6fdbae5402e0389",
"sha256:5913a1177fc36e30fcf6dc868ce23b0453952c78c04c266d3149b3d39e1410d6",
"sha256:5b6ef7d9f9c38292df3690fe3e302b5b530999fa90014853dcd0d6902fb59f26",
+ "sha256:5bf37a08493232fbb0f8229f1824b366c2fc1d02d64e7e918af40acd15f3e337",
"sha256:5cb1e18167792d7d21e21365d7650b72d5081ed476123ff7b8cac7f45189c0c7",
"sha256:61a7ee1f13ab913897dac7da44a73c6d44d48a4adff42a5701e3239791c96e14",
"sha256:622a231b08899c864eb87e85f81c75e7b9ce05b001e59bbfbf43d4a71f5f32b2",
@@ -102,6 +111,7 @@
"sha256:6b2ae9f5f67f89aade1fab0f7fd8f2832501311c363a21579d02defa844d9296",
"sha256:6c772d6c0a79ac0f414a9f8947cc407e119b8598de7621f39cacadae3cf57d12",
"sha256:6d847b14f7ea89f6ad3c9e3901d1bc4835f6b390a9c71df999b0162d9bb1e20f",
+ "sha256:73fd30d4ce0ea48010564ccee1a26bfe39323fde05cb34b5863455629db61dc7",
"sha256:76ffebb907bec09ff511bb3acc077695e2c32bc2142819491579a695f77ffd4d",
"sha256:7bbff90b63328013e1e8cb50650ae0b9bac54ffb4be6104378490193cd60f85a",
"sha256:7cb81373984cc0e4682f31bc3d6be9026006d96eecd07ea49aafb06897746452",
@@ -111,6 +121,7 @@
"sha256:87fdccbb6bb589095f413b1e05734ba492c962b4a45a13ff3408fa44ffe6479b",
"sha256:88c63a1b55f352b02c6ffd24b15ead9fc0e8bf781dbe070213039324922a2eea",
"sha256:8a674ac10e0a87b683f4fa2b6fa41090edfd686a6524bd8dedbd6138b309175c",
+ "sha256:8ed6a5b3d23ecc00ea02e1ed8e0ff9a08f4fc87a1f58a2530e71c0f48adf882f",
"sha256:93130612b837103e15ac3f9cbacb4613f9e348b58b3aad53721d92e57f96d46a",
"sha256:9744a863b489c79a73aba014df554b0e7a0fc44ef3f8a0ef2a52919c7d155031",
"sha256:9749a124280a0ada4187a6cfd1ffd35c350fb3af79c706589d98e088c5044267",
@@ -120,15 +131,24 @@
"sha256:9ed4c92a0665002ff8ea852353aeb60d9141eb04109e88928026d3c8a9e5433c",
"sha256:a72661af47119a80d82fa583b554095308d6a4c356b2a554fdc2799bc19f2a43",
"sha256:afde17ae04d90fbe53afb628f7f2d4ca022797aa093e809de5c3cf276f61bbfa",
+ "sha256:b1375b5d17d6145c798661b67e4ae9d5496920d9265e2f00f1c2c0b5ae91fbde",
"sha256:b336c5e9cf03c7be40c47b5fd694c43c9f1358a80ba384a21969e0b4e66a9b17",
+ "sha256:b3523f51818e8f16599613edddb1ff924eeb4b53ab7e7197f85cbc321cdca32f",
+ "sha256:b43775532a5904bc938f9c15b77c613cb6ad6fb30990f3b0afaea82797a402d8",
"sha256:b663f1e02de5d0573610756398e44c130add0eb9a3fc912a09665332942a2efb",
"sha256:b83bb06a0192cccf1eb8d0a28672a1b79c74c3a8a5f2619625aeb6f28b3a82bb",
+ "sha256:ba72d37e2a924717990f4d7482e8ac88e2ef43fb95491eb6e0d124d77d2a150d",
"sha256:c2415d9d082152460f2bd4e382a1e85aed233abc92db5a3880da2257dc7daf7b",
"sha256:c83aa123d56f2e060644427a882a36b3c12db93727ad7a7b9efd7d7f3e9cc2c4",
+ "sha256:c8e521a0ce7cf690ca84b8cc2272ddaf9d8a50294fd086da67e517439614c755",
+ "sha256:cab1b5964b39607a66adbba01f1c12df2e55ac36c81ec6ed44f2fca44178bf1a",
+ "sha256:cb02ed34557afde2d2da68194d12f5719ee96cfb2eacc886352cb73e3808fc5d",
+ "sha256:cc0283a406774f465fb45ec7efb66857c09ffefbe49ec20b7882eff6d3c86d3a",
"sha256:cfc391f4429ee0a9370aa93d812a52e1fee0f37a81861f4fdd1f4fb28e8547c3",
"sha256:db844eb158a87ccab83e868a762ea8024ae27337fc7ddcbfcddd157f841fdfe7",
"sha256:defed7ea5f218a9f2336301e6fd379f55c655bea65ba2476346340a0ce6f74a1",
"sha256:e16eb9541f3dd1a3e92b89005e37b1257b157b7256df0e36bd7b33b50be73bcb",
+ "sha256:e1abbeef02962596548382e393f56e4c94acd286bd0c5afba756cffc33670e8a",
"sha256:e23281b9a08ec338469268f98f194658abfb13658ee98e2b7f85ee9dd06caa91",
"sha256:e2d9e1cbc1b25e22000328702b014227737756f4b5bf5c485ac1d8091ada078b",
"sha256:e48f4234f2469ed012a98f4b7874e7f7e173c167bed4934912a29e03167cf6b1",
@@ -249,10 +269,10 @@
},
"cloudscraper": {
"hashes": [
- "sha256:152fa9f9db5f19f4ada7e75623e93f45d05bfd3fb29d9cae84f29173a2591530",
- "sha256:59d964acded1a63336b3ce4daf3f2dfed3de7c88f6bf4d904c661b0b4e1b5f5e"
+ "sha256:5f0cde23774270e8a092de68e0fbd68e17854c767fc2d4042a91bda9e4816871",
+ "sha256:ec30da6cee60d0a95e898d9b3aaf09291a0d8b6cf751e86c6f3420b699a00091"
],
- "version": "==1.2.64"
+ "version": "==1.2.66"
},
"commonmark": {
"hashes": [
@@ -263,35 +283,35 @@
},
"cryptography": {
"hashes": [
- "sha256:068147f32fa662c81aebab95c74679b401b12b57494872886eb5c1139250ec5d",
- "sha256:06fc3cc7b6f6cca87bd56ec80a580c88f1da5306f505876a71c8cfa7050257dd",
- "sha256:25c1d1f19729fb09d42e06b4bf9895212292cb27bb50229f5aa64d039ab29146",
- "sha256:402852a0aea73833d982cabb6d0c3bb582c15483d29fb7085ef2c42bfa7e38d7",
- "sha256:4e269dcd9b102c5a3d72be3c45d8ce20377b8076a43cbed6f660a1afe365e436",
- "sha256:5419a127426084933076132d317911e3c6eb77568a1ce23c3ac1e12d111e61e0",
- "sha256:554bec92ee7d1e9d10ded2f7e92a5d70c1f74ba9524947c0ba0c850c7b011828",
- "sha256:5e89468fbd2fcd733b5899333bc54d0d06c80e04cd23d8c6f3e0542358c6060b",
- "sha256:65535bc550b70bd6271984d9863a37741352b4aad6fb1b3344a54e6950249b55",
- "sha256:6ab9516b85bebe7aa83f309bacc5f44a61eeb90d0b4ec125d2d003ce41932d36",
- "sha256:6addc3b6d593cd980989261dc1cce38263c76954d758c3c94de51f1e010c9a50",
- "sha256:728f2694fa743a996d7784a6194da430f197d5c58e2f4e278612b359f455e4a2",
- "sha256:785e4056b5a8b28f05a533fab69febf5004458e20dad7e2e13a3120d8ecec75a",
- "sha256:78cf5eefac2b52c10398a42765bfa981ce2372cbc0457e6bf9658f41ec3c41d8",
- "sha256:7f836217000342d448e1c9a342e9163149e45d5b5eca76a30e84503a5a96cab0",
- "sha256:8d41a46251bf0634e21fac50ffd643216ccecfaf3701a063257fe0b2be1b6548",
- "sha256:984fe150f350a3c91e84de405fe49e688aa6092b3525f407a18b9646f6612320",
- "sha256:9b24bcff7853ed18a63cfb0c2b008936a9554af24af2fb146e16d8e1aed75748",
- "sha256:b1b35d9d3a65542ed2e9d90115dfd16bbc027b3f07ee3304fc83580f26e43249",
- "sha256:b1b52c9e5f8aa2b802d48bd693190341fae201ea51c7a167d69fc48b60e8a959",
- "sha256:bbf203f1a814007ce24bd4d51362991d5cb90ba0c177a9c08825f2cc304d871f",
- "sha256:be243c7e2bfcf6cc4cb350c0d5cdf15ca6383bbcb2a8ef51d3c9411a9d4386f0",
- "sha256:bfbe6ee19615b07a98b1d2287d6a6073f734735b49ee45b11324d85efc4d5cbd",
- "sha256:c46837ea467ed1efea562bbeb543994c2d1f6e800785bd5a2c98bc096f5cb220",
- "sha256:dfb4f4dd568de1b6af9f4cda334adf7d72cf5bc052516e1b2608b683375dd95c",
- "sha256:ed7b00096790213e09eb11c97cc6e2b757f15f3d2f85833cd2d3ec3fe37c1722"
+ "sha256:0e70da4bdff7601b0ef48e6348339e490ebfb0cbe638e083c9c41fb49f00c8bd",
+ "sha256:10652dd7282de17990b88679cb82f832752c4e8237f0c714be518044269415db",
+ "sha256:175c1a818b87c9ac80bb7377f5520b7f31b3ef2a0004e2420319beadedb67290",
+ "sha256:1d7e632804a248103b60b16fb145e8df0bc60eed790ece0d12efe8cd3f3e7744",
+ "sha256:1f13ddda26a04c06eb57119caf27a524ccae20533729f4b1e4a69b54e07035eb",
+ "sha256:2ec2a8714dd005949d4019195d72abed84198d877112abb5a27740e217e0ea8d",
+ "sha256:2fa36a7b2cc0998a3a4d5af26ccb6273f3df133d61da2ba13b3286261e7efb70",
+ "sha256:2fb481682873035600b5502f0015b664abc26466153fab5c6bc92c1ea69d478b",
+ "sha256:3178d46f363d4549b9a76264f41c6948752183b3f587666aff0555ac50fd7876",
+ "sha256:4367da5705922cf7070462e964f66e4ac24162e22ab0a2e9d31f1b270dd78083",
+ "sha256:4eb85075437f0b1fd8cd66c688469a0c4119e0ba855e3fef86691971b887caf6",
+ "sha256:50a1494ed0c3f5b4d07650a68cd6ca62efe8b596ce743a5c94403e6f11bf06c1",
+ "sha256:53049f3379ef05182864d13bb9686657659407148f901f3f1eee57a733fb4b00",
+ "sha256:6391e59ebe7c62d9902c24a4d8bcbc79a68e7c4ab65863536127c8a9cd94043b",
+ "sha256:67461b5ebca2e4c2ab991733f8ab637a7265bb582f07c7c88914b5afb88cb95b",
+ "sha256:78e47e28ddc4ace41dd38c42e6feecfdadf9c3be2af389abbfeef1ff06822285",
+ "sha256:80ca53981ceeb3241998443c4964a387771588c4e4a5d92735a493af868294f9",
+ "sha256:8a4b2bdb68a447fadebfd7d24855758fe2d6fecc7fed0b78d190b1af39a8e3b0",
+ "sha256:8e45653fb97eb2f20b8c96f9cd2b3a0654d742b47d638cf2897afbd97f80fa6d",
+ "sha256:998cd19189d8a747b226d24c0207fdaa1e6658a1d3f2494541cb9dfbf7dcb6d2",
+ "sha256:a10498349d4c8eab7357a8f9aa3463791292845b79597ad1b98a543686fb1ec8",
+ "sha256:b4cad0cea995af760f82820ab4ca54e5471fc782f70a007f31531957f43e9dee",
+ "sha256:bfe6472507986613dc6cc00b3d492b2f7564b02b3b3682d25ca7f40fa3fd321b",
+ "sha256:c9e0d79ee4c56d841bd4ac6e7697c8ff3c8d6da67379057f29e66acffcd1e9a7",
+ "sha256:ca57eb3ddaccd1112c18fc80abe41db443cc2e9dcb1917078e02dfa010a4f353",
+ "sha256:ce127dd0a6a0811c251a6cddd014d292728484e530d80e872ad9806cfb1c5b3c"
],
"markers": "python_version >= '3.6'",
- "version": "==38.0.3"
+ "version": "==38.0.4"
},
"dataclasses-json": {
"hashes": [
@@ -303,19 +323,19 @@
},
"dateparser": {
"hashes": [
- "sha256:711f7eef6d431225bec56c00e386af3f6a47083276253375bdae1ae6c8d23d4a",
- "sha256:ae7a7de30f26983d09fff802c1f9d35d54e1c11d7ab52ae904a1f3fc037ecba5"
+ "sha256:4431159799b63d8acec5d7d844c5e06edf3d1b0eb2bda6d4cac87134ddddd01c",
+ "sha256:73ec6e44a133c54076ecf9f9dc0fbe3dd4831f154f977ff06f53114d57c5425e"
],
"index": "pypi",
- "version": "==1.1.3"
+ "version": "==1.1.4"
},
"exceptiongroup": {
"hashes": [
- "sha256:2ac84b496be68464a2da60da518af3785fff8b7ec0d090a581604bc870bdee41",
- "sha256:affbabf13fb6e98988c38d9c5650e701569fe3c1de3233cfb61c5f33774690ad"
+ "sha256:542adf9dea4055530d6e1279602fa5cb11dab2395fa650b8674eaec35fc4a828",
+ "sha256:bd14967b79cd9bdb54d97323216f8fdf533e278df937aa2a90089e7d6e06e5ec"
],
"markers": "python_version < '3.11'",
- "version": "==1.0.0"
+ "version": "==1.0.4"
},
"ffmpeg-python": {
"hashes": [
@@ -327,11 +347,11 @@
},
"filelock": {
"hashes": [
- "sha256:55447caa666f2198c5b6b13a26d2084d26fa5b115c00d065664b2124680c4edc",
- "sha256:617eb4e5eedc82fc5f47b6d61e4d11cb837c56cb4544e39081099fa17ad109d4"
+ "sha256:7565f628ea56bfcd8e54e42bdc55da899c85c1abfe1b5bcfd147e9188cebb3b2",
+ "sha256:8df285554452285f79c035efb0c861eb33a4bcfa5b7a137016e32e6a90f9792c"
],
"markers": "python_version >= '3.7'",
- "version": "==3.8.0"
+ "version": "==3.8.2"
},
"flask": {
"hashes": [
@@ -350,27 +370,27 @@
},
"google-api-core": {
"hashes": [
- "sha256:10c06f7739fe57781f87523375e8e1a3a4674bf6392cd6131a3222182b971320",
- "sha256:34f24bd1d5f72a8c4519773d99ca6bf080a6c4e041b4e9f024fe230191dda62e"
+ "sha256:4b9bb5d5a380a0befa0573b302651b8a9a89262c1730e37bf423cec511804c22",
+ "sha256:ce222e27b0de0d7bc63eb043b956996d6dccab14cc3b690aaea91c9cc99dc16e"
],
"markers": "python_version >= '3.7'",
- "version": "==2.10.2"
+ "version": "==2.11.0"
},
"google-api-python-client": {
"hashes": [
- "sha256:2c6611530308b3f931dcf1360713aa3a20cf465d0bf2bac65f2ec99e8c9860de",
- "sha256:b8a0ca8454ad57bc65199044717d3d214197ae1e2d666426bbcd4021b36762e0"
+ "sha256:03624a28b5ba94f3c3d44761081f5dbf8cabaa20c5c3a96c046457c5713efb9b",
+ "sha256:bc2447a7479006d98927fb20faa74d892d3758ff68e99b621367632bc42b8af8"
],
"index": "pypi",
- "version": "==2.65.0"
+ "version": "==2.69.0"
},
"google-auth": {
"hashes": [
- "sha256:1ad5b0e6eba5f69645971abb3d2c197537d5914070a8c6d30299dfdb07c5c700",
- "sha256:cf24817855d874ede2efd071aa22125445f555de1685b739a9782fcf408c2a3d"
+ "sha256:6897b93556d8d807ad70701bb89f000183aea366ca7ed94680828b37437a4994",
+ "sha256:72f12a6cfc968d754d7bdab369c5c5c16032106e52d32c6dfd8484e4c01a6d1f"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5'",
- "version": "==2.14.0"
+ "version": "==2.15.0"
},
"google-auth-httplib2": {
"hashes": [
@@ -382,27 +402,27 @@
},
"google-auth-oauthlib": {
"hashes": [
- "sha256:53019edbde83e08ff0740eefc5bded7e26a289941d12e7ae1f0f5bacf2faa031",
- "sha256:db11bce4b3effc99b518ec22a2903470e0853c0c92be57694e3684e738d22513"
+ "sha256:40cc612a13c3336d5433e94e2adb42a0c88f6feb6c55769e44500fc70043a576",
+ "sha256:81056a310fb1c4a3e5a7e1a443e1eb96593c6bbc55b26c0261e4d3295d3e6593"
],
"index": "pypi",
- "version": "==0.7.0"
+ "version": "==0.8.0"
},
"googleapis-common-protos": {
"hashes": [
- "sha256:8eb2cbc91b69feaf23e32452a7ae60e791e09967d81d4fcc7fc388182d1bd394",
- "sha256:c25873c47279387cfdcbdafa36149887901d36202cb645a0e4f29686bf6e4417"
+ "sha256:27a849d6205838fb6cc3c1c21cb9800707a661bb21c6ce7fb13e99eb1f8a0c46",
+ "sha256:a9f4a1d7f6d9809657b7f1316a1aa527f6664891531bcfcc13b6696e685f443c"
],
"markers": "python_version >= '3.7'",
- "version": "==1.56.4"
+ "version": "==1.57.0"
},
"gspread": {
"hashes": [
- "sha256:41f7a416425f1ec5a1b677f49b8fbf599102766c27ed7be6601a58c9a1550ebc",
- "sha256:d3bbff4b7aad0fc2c986458e148537a02fe7b46e7162f41f3a42392bfa2adb89"
+ "sha256:ce76f9c16b88ccb792350142224a59afa8e69f7463f3d3417148cbe892efc7cb",
+ "sha256:dbeedd08c6a7f7b0bfc1a54e17c29205362250c77bf98e11125c5d99fd7f4ba7"
],
"index": "pypi",
- "version": "==5.6.2"
+ "version": "==5.7.2"
},
"h11": {
"hashes": [
@@ -591,11 +611,11 @@
},
"marshmallow": {
"hashes": [
- "sha256:35e02a3a06899c9119b785c12a22f4cda361745d66a71ab691fd7610202ae104",
- "sha256:6804c16114f7fce1f5b4dadc31f4674af23317fcc7f075da21e35c1a35d781f7"
+ "sha256:90032c0fd650ce94b6ec6dc8dfeb0e3ff50c144586462c389b81a07205bedb78",
+ "sha256:93f0958568da045b0021ec6aeb7ac37c81bfcccbb9a0e7ed8559885070b3a19b"
],
"markers": "python_version >= '3.7'",
- "version": "==3.18.0"
+ "version": "==3.19.0"
},
"marshmallow-enum": {
"hashes": [
@@ -645,31 +665,31 @@
},
"packaging": {
"hashes": [
- "sha256:dd47c42927d89ab911e606518907cc2d3a1f38bbd026385970643f9c5b8ecfeb",
- "sha256:ef103e05f519cdc783ae24ea4e2e0f508a9c99b2d4969652eed6a2e1ea5bd522"
+ "sha256:2198ec20bd4c017b8f9717e00f0c8714076fc2fd93816750ab48e2c41de2cfd3",
+ "sha256:957e2148ba0e1a3b282772e791ef1d8083648bc131c8ab0c1feba110ce1146c3"
],
- "markers": "python_version >= '3.6'",
- "version": "==21.3"
+ "markers": "python_version >= '3.7'",
+ "version": "==22.0"
},
"protobuf": {
"hashes": [
- "sha256:2c9c2ed7466ad565f18668aa4731c535511c5d9a40c6da39524bccf43e441719",
- "sha256:48e2cd6b88c6ed3d5877a3ea40df79d08374088e89bedc32557348848dff250b",
- "sha256:5b0834e61fb38f34ba8840d7dcb2e5a2f03de0c714e0293b3963b79db26de8ce",
- "sha256:61f21493d96d2a77f9ca84fefa105872550ab5ef71d21c458eb80edcf4885a99",
- "sha256:6e0be9f09bf9b6cf497b27425487706fa48c6d1632ddd94dab1a5fe11a422392",
- "sha256:6e312e280fbe3c74ea9e080d9e6080b636798b5e3939242298b591064470b06b",
- "sha256:7eb8f2cc41a34e9c956c256e3ac766cf4e1a4c9c925dc757a41a01be3e852965",
- "sha256:84ea107016244dfc1eecae7684f7ce13c788b9a644cd3fca5b77871366556444",
- "sha256:9227c14010acd9ae7702d6467b4625b6fe853175a6b150e539b21d2b2f2b409c",
- "sha256:a419cc95fca8694804709b8c4f2326266d29659b126a93befe210f5bbc772536",
- "sha256:a7d0ea43949d45b836234f4ebb5ba0b22e7432d065394b532cdca8f98415e3cf",
- "sha256:b5ab0b8918c136345ff045d4b3d5f719b505b7c8af45092d7f45e304f55e50a1",
- "sha256:e575c57dc8b5b2b2caa436c16d44ef6981f2235eb7179bfc847557886376d740",
- "sha256:f9eae277dd240ae19bb06ff4e2346e771252b0e619421965504bd1b1bba7c5fa"
+ "sha256:25266bf373ee06d5d66f9eb1ec9d434b243dccce5c32faf151054cfa6f9dcbf1",
+ "sha256:260e346927fd4e6fbb49ab545137b19610c24a1d853dc5f29ddf777ab1987211",
+ "sha256:2c6a4d13732d9b094db31b3841986c38b17ac61a3fe05ee26a779d94c4c3fb43",
+ "sha256:4922e3320ed70e81f05060822da36923d09fd9e04e17f411f2d8d8d0070f9f5c",
+ "sha256:4b75c947289a2e9c1f37d21c593f1ef6fb4fed33977dfb2ac84f799eb29a8ff4",
+ "sha256:4d01ef83517c181d60ea1c6d0b2f644be250ade740d6554a2f5a021b1ad622e3",
+ "sha256:553e35c0878f6855e55f01a14561e6bce6df79b6636a5acf83b9d9ac7eab7922",
+ "sha256:85ccb4753ee21de7dc81a7a68a051f25dbe133ffa01a639ac998427d0b223387",
+ "sha256:a5a14b907a191319e7a58b38c583bbf50deb21e002f723a912c5e4f6969a778e",
+ "sha256:a944dc9550baae276afc7dc8193191d4c2ad660270a1e5ed5a71539817ebe2e2",
+ "sha256:bab4b21a986ded225b9392c07ce21c35d790951f51e1ebfd32e4d443b05c3726",
+ "sha256:c3b9e329b4c247dc3ba5c50f60915a84e08278eb6d9e3fa674d0d04ff816bfd7",
+ "sha256:d91a47c77b33580024b0271b65bb820c4e0264c25eb49151ad01e691de8fa0b6",
+ "sha256:efb16b16fd3eef25357f84d516062753014b76279ce4e0ec4880badd2fba7370"
],
"markers": "python_version >= '3.7'",
- "version": "==4.21.9"
+ "version": "==4.21.11"
},
"pyaes": {
"hashes": [
@@ -722,39 +742,35 @@
},
"pycryptodomex": {
"hashes": [
- "sha256:04cc393045a8f19dd110c975e30f38ed7ab3faf21ede415ea67afebd95a22380",
- "sha256:0776bfaf2c48154ab54ea45392847c1283d2fcf64e232e85565f858baedfc1fa",
- "sha256:0fadb9f7fa3150577800eef35f62a8a24b9ddf1563ff060d9bd3af22d3952c8c",
- "sha256:18e2ab4813883ae63396c0ffe50b13554b32bb69ec56f0afaf052e7a7ae0d55b",
- "sha256:191e73bc84a8064ad1874dba0ebadedd7cce4dedee998549518f2c74a003b2e1",
- "sha256:35a8f7afe1867118330e2e0e0bf759c409e28557fb1fc2fbb1c6c937297dbe9a",
- "sha256:3709f13ca3852b0b07fc04a2c03b379189232b24007c466be0f605dd4723e9d4",
- "sha256:4540904c09704b6f831059c0dfb38584acb82cb97b0125cd52688c1f1e3fffa6",
- "sha256:463119d7d22d0fc04a0f9122e9d3e6121c6648bcb12a052b51bd1eed1b996aa2",
- "sha256:46b3f05f2f7ac7841053da4e0f69616929ca3c42f238c405f6c3df7759ad2780",
- "sha256:48697790203909fab02a33226fda546604f4e2653f9d47bc5d3eb40879fa7c64",
- "sha256:5676a132169a1c1a3712edf25250722ebc8c9102aa9abd814df063ca8362454f",
- "sha256:65204412d0c6a8e3c41e21e93a5e6054a74fea501afa03046a388cf042e3377a",
- "sha256:67e1e6a92151023ccdfcfbc0afb3314ad30080793b4c27956ea06ab1fb9bcd8a",
- "sha256:6f5b6ba8aefd624834bc177a2ac292734996bb030f9d1b388e7504103b6fcddf",
- "sha256:7341f1bb2dadb0d1a0047f34c3a58208a92423cdbd3244d998e4b28df5eac0ed",
- "sha256:78d9621cf0ea35abf2d38fa2ca6d0634eab6c991a78373498ab149953787e5e5",
- "sha256:8eecdf9cdc7343001d047f951b9cc805cd68cb6cd77b20ea46af5bffc5bd3dfb",
- "sha256:94c7b60e1f52e1a87715571327baea0733708ab4723346598beca4a3b6879794",
- "sha256:996e1ba717077ce1e6d4849af7a1426f38b07b3d173b879e27d5e26d2e958beb",
- "sha256:a07a64709e366c2041cd5cfbca592b43998bf4df88f7b0ca73dca37071ccf1bd",
- "sha256:b6306403228edde6e289f626a3908a2f7f67c344e712cf7c0a508bab3ad9e381",
- "sha256:b9279adc16e4b0f590ceff581f53a80179b02cba9056010d733eb4196134a870",
- "sha256:c4cb9cb492ea7dcdf222a8d19a1d09002798ea516aeae8877245206d27326d86",
- "sha256:dd452a5af7014e866206d41751886c9b4bf379a339fdf2dbfc7dd16c0fb4f8e0",
- "sha256:e2b12968522a0358b8917fc7b28865acac002f02f4c4c6020fcb264d76bfd06d",
- "sha256:e3164a18348bd53c69b4435ebfb4ac8a4076291ffa2a70b54f0c4b80c7834b1d",
- "sha256:e47bf8776a7e15576887f04314f5228c6527b99946e6638cf2f16da56d260cab",
- "sha256:f8be976cec59b11f011f790b88aca67b4ea2bd286578d0bd3e31bcd19afcd3e4",
- "sha256:fc9bc7a9b79fe5c750fc81a307052f8daabb709bdaabb0fb18fb136b66b653b5"
+ "sha256:04610536921c1ec7adba158ef570348550c9f3a40bc24be9f8da2ef7ab387981",
+ "sha256:0ba28aa97cdd3ff5ed1a4f2b7f5cd04e721166bd75bd2b929e2734433882b583",
+ "sha256:0da835af786fdd1c9930994c78b23e88d816dc3f99aa977284a21bbc26d19735",
+ "sha256:1619087fb5b31510b0b0b058a54f001a5ffd91e6ffee220d9913064519c6a69d",
+ "sha256:1cda60207be8c1cf0b84b9138f9e3ca29335013d2b690774a5e94678ff29659a",
+ "sha256:22aed0868622d95179217c298e37ed7410025c7b29dac236d3230617d1e4ed56",
+ "sha256:231dc8008cbdd1ae0e34645d4523da2dbc7a88c325f0d4a59635a86ee25b41dd",
+ "sha256:2ad9bb86b355b6104796567dd44c215b3dc953ef2fae5e0bdfb8516731df92cf",
+ "sha256:4dbbe18cc232b5980c7633972ae5417d0df76fe89e7db246eefd17ef4d8e6d7a",
+ "sha256:6a465e4f856d2a4f2a311807030c89166529ccf7ccc65bef398de045d49144b6",
+ "sha256:70288d9bfe16b2fd0d20b6c365db614428f1bcde7b20d56e74cf88ade905d9eb",
+ "sha256:7993d26dae4d83b8f4ce605bb0aecb8bee330bb3c95475ef06f3694403621e71",
+ "sha256:8851585ff19871e5d69e1790f4ca5f6fd1699d6b8b14413b472a4c0dbc7ea780",
+ "sha256:893f8a97d533c66cc3a56e60dd3ed40a3494ddb4aafa7e026429a08772f8a849",
+ "sha256:8dd2d9e3c617d0712ed781a77efd84ea579e76c5f9b2a4bc0b684ebeddf868b2",
+ "sha256:a1c0ae7123448ecb034c75c713189cb00ebe2d415b11682865b6c54d200d9c93",
+ "sha256:b0789a8490114a2936ed77c87792cfe77582c829cb43a6d86ede0f9624ba8aa3",
+ "sha256:b3d04c00d777c36972b539fb79958790126847d84ec0129fce1efef250bfe3ce",
+ "sha256:ba57ac7861fd2c837cdb33daf822f2a052ff57dd769a2107807f52a36d0e8d38",
+ "sha256:ce338a9703f54b2305a408fc9890eb966b727ce72b69f225898bb4e9d9ed3f1f",
+ "sha256:daa67f5ebb6fbf1ee9c90decaa06ca7fc88a548864e5e484d52b0920a57fe8a5",
+ "sha256:e2453162f473c1eae4826eb10cd7bce19b5facac86d17fb5f29a570fde145abd",
+ "sha256:e25a2f5667d91795f9417cb856f6df724ccdb0cdd5cbadb212ee9bf43946e9f8",
+ "sha256:e5a670919076b71522c7d567a9043f66f14b202414a63c3a078b5831ae342c03",
+ "sha256:e9ba9d8ed638733c9e95664470b71d624a6def149e2db6cc52c1aca5a6a2df1d",
+ "sha256:f2b971a7b877348a27dcfd0e772a0343fb818df00b74078e91c008632284137d"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
- "version": "==3.15.0"
+ "version": "==3.16.0"
},
"pygments": {
"hashes": [
@@ -798,19 +814,19 @@
},
"python-slugify": {
"hashes": [
- "sha256:272d106cb31ab99b3496ba085e3fea0e9e76dcde967b5e9992500d1f785ce4e1",
- "sha256:7b2c274c308b62f4269a9ba701aa69a797e9bca41aeee5b3a9e79e36b6656927"
+ "sha256:003aee64f9fd955d111549f96c4b58a3f40b9319383c70fad6277a4974bbf570",
+ "sha256:7a0f21a39fa6c1c4bf2e5984c9b9ae944483fd10b54804cb0e23a3ccd4954f0b"
],
"index": "pypi",
- "version": "==6.1.2"
+ "version": "==7.0.0"
},
"python-twitter-v2": {
"hashes": [
- "sha256:04349e74ec6ebaa3c71d02dc82610acd3b6b346a0060adf4bad2379fd3f46701",
- "sha256:1b17b3243108a7d8d1af0b71a3e87f28d105b5fe61cfd09944e28a7903769c81"
+ "sha256:18c14853da8b499775a11a3f5e1d0692a7017fa41eca91ac5afa73f35b935a90",
+ "sha256:fbe582ae7c6b33f6055b97e23dd106874e6650091d257fe67bfd024b96ebf8d6"
],
"index": "pypi",
- "version": "==0.7.9"
+ "version": "==0.8.0"
},
"pytz": {
"hashes": [
@@ -875,83 +891,97 @@
},
"regex": {
"hashes": [
- "sha256:0008650041531d0eadecc96a73d37c2dc4821cf51b0766e374cb4f1ddc4e1c14",
- "sha256:03299b0bcaa7824eb7c0ebd7ef1e3663302d1b533653bfe9dc7e595d453e2ae9",
- "sha256:06b1df01cf2aef3a9790858af524ae2588762c8a90e784ba00d003f045306204",
- "sha256:09b4b6ccc61d4119342b26246ddd5a04accdeebe36bdfe865ad87a0784efd77f",
- "sha256:0be0c34a39e5d04a62fd5342f0886d0e57592a4f4993b3f9d257c1f688b19737",
- "sha256:0d96eec8550fd2fd26f8e675f6d8b61b159482ad8ffa26991b894ed5ee19038b",
- "sha256:0eb0e2845e81bdea92b8281a3969632686502565abf4a0b9e4ab1471c863d8f3",
- "sha256:13bbf0c9453c6d16e5867bda7f6c0c7cff1decf96c5498318bb87f8136d2abd4",
- "sha256:17e51ad1e6131c496b58d317bc9abec71f44eb1957d32629d06013a21bc99cac",
- "sha256:1977bb64264815d3ef016625adc9df90e6d0e27e76260280c63eca993e3f455f",
- "sha256:1e30762ddddb22f7f14c4f59c34d3addabc789216d813b0f3e2788d7bcf0cf29",
- "sha256:1e73652057473ad3e6934944af090852a02590c349357b79182c1b681da2c772",
- "sha256:20e6a27959f162f979165e496add0d7d56d7038237092d1aba20b46de79158f1",
- "sha256:286ff9ec2709d56ae7517040be0d6c502642517ce9937ab6d89b1e7d0904f863",
- "sha256:297c42ede2c81f0cb6f34ea60b5cf6dc965d97fa6936c11fc3286019231f0d66",
- "sha256:320c2f4106962ecea0f33d8d31b985d3c185757c49c1fb735501515f963715ed",
- "sha256:35ed2f3c918a00b109157428abfc4e8d1ffabc37c8f9abc5939ebd1e95dabc47",
- "sha256:3d146e5591cb67c5e836229a04723a30af795ef9b70a0bbd913572e14b7b940f",
- "sha256:42bb37e2b2d25d958c25903f6125a41aaaa1ed49ca62c103331f24b8a459142f",
- "sha256:42d6007722d46bd2c95cce700181570b56edc0dcbadbfe7855ec26c3f2d7e008",
- "sha256:43eba5c46208deedec833663201752e865feddc840433285fbadee07b84b464d",
- "sha256:452519bc4c973e961b1620c815ea6dd8944a12d68e71002be5a7aff0a8361571",
- "sha256:4b9c16a807b17b17c4fa3a1d8c242467237be67ba92ad24ff51425329e7ae3d0",
- "sha256:5510932596a0f33399b7fff1bd61c59c977f2b8ee987b36539ba97eb3513584a",
- "sha256:55820bc631684172b9b56a991d217ec7c2e580d956591dc2144985113980f5a3",
- "sha256:57484d39447f94967e83e56db1b1108c68918c44ab519b8ecfc34b790ca52bf7",
- "sha256:58ba41e462653eaf68fc4a84ec4d350b26a98d030be1ab24aba1adcc78ffe447",
- "sha256:5bc5f921be39ccb65fdda741e04b2555917a4bced24b4df14eddc7569be3b493",
- "sha256:5dcc4168536c8f68654f014a3db49b6b4a26b226f735708be2054314ed4964f4",
- "sha256:5f92a7cdc6a0ae2abd184e8dfd6ef2279989d24c85d2c85d0423206284103ede",
- "sha256:67250b36edfa714ba62dc62d3f238e86db1065fccb538278804790f578253640",
- "sha256:6df070a986fc064d865c381aecf0aaff914178fdf6874da2f2387e82d93cc5bd",
- "sha256:729aa8ca624c42f309397c5fc9e21db90bf7e2fdd872461aabdbada33de9063c",
- "sha256:72bc3a5effa5974be6d965ed8301ac1e869bc18425c8a8fac179fbe7876e3aee",
- "sha256:74d86e8924835f863c34e646392ef39039405f6ce52956d8af16497af4064a30",
- "sha256:79e5af1ff258bc0fe0bdd6f69bc4ae33935a898e3cbefbbccf22e88a27fa053b",
- "sha256:7b103dffb9f6a47ed7ffdf352b78cfe058b1777617371226c1894e1be443afec",
- "sha256:83f03f0bd88c12e63ca2d024adeee75234d69808b341e88343b0232329e1f1a1",
- "sha256:86d7a68fa53688e1f612c3246044157117403c7ce19ebab7d02daf45bd63913e",
- "sha256:878c626cbca3b649e14e972c14539a01191d79e58934e3f3ef4a9e17f90277f8",
- "sha256:878f5d649ba1db9f52cc4ef491f7dba2d061cdc48dd444c54260eebc0b1729b9",
- "sha256:87bc01226cd288f0bd9a4f9f07bf6827134dc97a96c22e2d28628e824c8de231",
- "sha256:8babb2b5751105dc0aef2a2e539f4ba391e738c62038d8cb331c710f6b0f3da7",
- "sha256:91e0f7e7be77250b808a5f46d90bf0032527d3c032b2131b63dee54753a4d729",
- "sha256:9557545c10d52c845f270b665b52a6a972884725aa5cf12777374e18f2ea8960",
- "sha256:9ccb0a4ab926016867260c24c192d9df9586e834f5db83dfa2c8fffb3a6e5056",
- "sha256:9d828c5987d543d052b53c579a01a52d96b86f937b1777bbfe11ef2728929357",
- "sha256:9efa41d1527b366c88f265a227b20bcec65bda879962e3fc8a2aee11e81266d7",
- "sha256:aaf5317c961d93c1a200b9370fb1c6b6836cc7144fef3e5a951326912bf1f5a3",
- "sha256:ab69b4fe09e296261377d209068d52402fb85ef89dc78a9ac4a29a895f4e24a7",
- "sha256:ad397bc7d51d69cb07ef89e44243f971a04ce1dca9bf24c992c362406c0c6573",
- "sha256:ae17fc8103f3b63345709d3e9654a274eee1c6072592aec32b026efd401931d0",
- "sha256:af4d8cc28e4c7a2f6a9fed544228c567340f8258b6d7ea815b62a72817bbd178",
- "sha256:b22ff939a8856a44f4822da38ef4868bd3a9ade22bb6d9062b36957c850e404f",
- "sha256:b549d851f91a4efb3e65498bd4249b1447ab6035a9972f7fc215eb1f59328834",
- "sha256:be319f4eb400ee567b722e9ea63d5b2bb31464e3cf1b016502e3ee2de4f86f5c",
- "sha256:c0446b2871335d5a5e9fcf1462f954586b09a845832263db95059dcd01442015",
- "sha256:c68d2c04f7701a418ec2e5631b7f3552efc32f6bcc1739369c6eeb1af55f62e0",
- "sha256:c87ac58b9baaf50b6c1b81a18d20eda7e2883aa9a4fb4f1ca70f2e443bfcdc57",
- "sha256:caa2734ada16a44ae57b229d45091f06e30a9a52ace76d7574546ab23008c635",
- "sha256:cb34c2d66355fb70ae47b5595aafd7218e59bb9c00ad8cc3abd1406ca5874f07",
- "sha256:cb3652bbe6720786b9137862205986f3ae54a09dec8499a995ed58292bdf77c2",
- "sha256:cf668f26604e9f7aee9f8eaae4ca07a948168af90b96be97a4b7fa902a6d2ac1",
- "sha256:d326ff80ed531bf2507cba93011c30fff2dd51454c85f55df0f59f2030b1687b",
- "sha256:d6c2441538e4fadd4291c8420853431a229fcbefc1bf521810fbc2629d8ae8c2",
- "sha256:d6ecfd1970b3380a569d7b3ecc5dd70dba295897418ed9e31ec3c16a5ab099a5",
- "sha256:e5602a9b5074dcacc113bba4d2f011d2748f50e3201c8139ac5b68cf2a76bd8b",
- "sha256:ef806f684f17dbd6263d72a54ad4073af42b42effa3eb42b877e750c24c76f86",
- "sha256:f3356afbb301ec34a500b8ba8b47cba0b44ed4641c306e1dd981a08b416170b5",
- "sha256:f6f7ee2289176cb1d2c59a24f50900f8b9580259fa9f1a739432242e7d254f93",
- "sha256:f7e8f1ee28e0a05831c92dc1c0c1c94af5289963b7cf09eca5b5e3ce4f8c91b0",
- "sha256:f8169ec628880bdbca67082a9196e2106060a4a5cbd486ac51881a4df805a36f",
- "sha256:fbc88d3ba402b5d041d204ec2449c4078898f89c4a6e6f0ed1c1a510ef1e221d",
- "sha256:fbd3fe37353c62fd0eb19fb76f78aa693716262bcd5f9c14bb9e5aca4b3f0dc4"
+ "sha256:052b670fafbe30966bbe5d025e90b2a491f85dfe5b2583a163b5e60a85a321ad",
+ "sha256:0653d012b3bf45f194e5e6a41df9258811ac8fc395579fa82958a8b76286bea4",
+ "sha256:0a069c8483466806ab94ea9068c34b200b8bfc66b6762f45a831c4baaa9e8cdd",
+ "sha256:0cf0da36a212978be2c2e2e2d04bdff46f850108fccc1851332bcae51c8907cc",
+ "sha256:131d4be09bea7ce2577f9623e415cab287a3c8e0624f778c1d955ec7c281bd4d",
+ "sha256:144486e029793a733e43b2e37df16a16df4ceb62102636ff3db6033994711066",
+ "sha256:1ddf14031a3882f684b8642cb74eea3af93a2be68893901b2b387c5fd92a03ec",
+ "sha256:1eba476b1b242620c266edf6325b443a2e22b633217a9835a52d8da2b5c051f9",
+ "sha256:20f61c9944f0be2dc2b75689ba409938c14876c19d02f7585af4460b6a21403e",
+ "sha256:22960019a842777a9fa5134c2364efaed5fbf9610ddc5c904bd3a400973b0eb8",
+ "sha256:22e7ebc231d28393dfdc19b185d97e14a0f178bedd78e85aad660e93b646604e",
+ "sha256:23cbb932cc53a86ebde0fb72e7e645f9a5eec1a5af7aa9ce333e46286caef783",
+ "sha256:29c04741b9ae13d1e94cf93fca257730b97ce6ea64cfe1eba11cf9ac4e85afb6",
+ "sha256:2bde29cc44fa81c0a0c8686992c3080b37c488df167a371500b2a43ce9f026d1",
+ "sha256:2cdc55ca07b4e70dda898d2ab7150ecf17c990076d3acd7a5f3b25cb23a69f1c",
+ "sha256:370f6e97d02bf2dd20d7468ce4f38e173a124e769762d00beadec3bc2f4b3bc4",
+ "sha256:395161bbdbd04a8333b9ff9763a05e9ceb4fe210e3c7690f5e68cedd3d65d8e1",
+ "sha256:44136355e2f5e06bf6b23d337a75386371ba742ffa771440b85bed367c1318d1",
+ "sha256:44a6c2f6374e0033873e9ed577a54a3602b4f609867794c1a3ebba65e4c93ee7",
+ "sha256:4919899577ba37f505aaebdf6e7dc812d55e8f097331312db7f1aab18767cce8",
+ "sha256:4b4b1fe58cd102d75ef0552cf17242705ce0759f9695334a56644ad2d83903fe",
+ "sha256:4bdd56ee719a8f751cf5a593476a441c4e56c9b64dc1f0f30902858c4ef8771d",
+ "sha256:4bf41b8b0a80708f7e0384519795e80dcb44d7199a35d52c15cc674d10b3081b",
+ "sha256:4cac3405d8dda8bc6ed499557625585544dd5cbf32072dcc72b5a176cb1271c8",
+ "sha256:4fe7fda2fe7c8890d454f2cbc91d6c01baf206fbc96d89a80241a02985118c0c",
+ "sha256:50921c140561d3db2ab9f5b11c5184846cde686bb5a9dc64cae442926e86f3af",
+ "sha256:5217c25229b6a85049416a5c1e6451e9060a1edcf988641e309dbe3ab26d3e49",
+ "sha256:5352bea8a8f84b89d45ccc503f390a6be77917932b1c98c4cdc3565137acc714",
+ "sha256:542e3e306d1669b25936b64917285cdffcd4f5c6f0247636fec037187bd93542",
+ "sha256:543883e3496c8b6d58bd036c99486c3c8387c2fc01f7a342b760c1ea3158a318",
+ "sha256:586b36ebda81e6c1a9c5a5d0bfdc236399ba6595e1397842fd4a45648c30f35e",
+ "sha256:597f899f4ed42a38df7b0e46714880fb4e19a25c2f66e5c908805466721760f5",
+ "sha256:5a260758454580f11dd8743fa98319bb046037dfab4f7828008909d0aa5292bc",
+ "sha256:5aefb84a301327ad115e9d346c8e2760009131d9d4b4c6b213648d02e2abe144",
+ "sha256:5e6a5567078b3eaed93558842346c9d678e116ab0135e22eb72db8325e90b453",
+ "sha256:5ff525698de226c0ca743bfa71fc6b378cda2ddcf0d22d7c37b1cc925c9650a5",
+ "sha256:61edbca89aa3f5ef7ecac8c23d975fe7261c12665f1d90a6b1af527bba86ce61",
+ "sha256:659175b2144d199560d99a8d13b2228b85e6019b6e09e556209dfb8c37b78a11",
+ "sha256:6a9a19bea8495bb419dc5d38c4519567781cd8d571c72efc6aa959473d10221a",
+ "sha256:6b30bddd61d2a3261f025ad0f9ee2586988c6a00c780a2fb0a92cea2aa702c54",
+ "sha256:6ffd55b5aedc6f25fd8d9f905c9376ca44fcf768673ffb9d160dd6f409bfda73",
+ "sha256:702d8fc6f25bbf412ee706bd73019da5e44a8400861dfff7ff31eb5b4a1276dc",
+ "sha256:74bcab50a13960f2a610cdcd066e25f1fd59e23b69637c92ad470784a51b1347",
+ "sha256:75f591b2055523fc02a4bbe598aa867df9e953255f0b7f7715d2a36a9c30065c",
+ "sha256:763b64853b0a8f4f9cfb41a76a4a85a9bcda7fdda5cb057016e7706fde928e66",
+ "sha256:76c598ca73ec73a2f568e2a72ba46c3b6c8690ad9a07092b18e48ceb936e9f0c",
+ "sha256:78d680ef3e4d405f36f0d6d1ea54e740366f061645930072d39bca16a10d8c93",
+ "sha256:7b280948d00bd3973c1998f92e22aa3ecb76682e3a4255f33e1020bd32adf443",
+ "sha256:7db345956ecce0c99b97b042b4ca7326feeec6b75facd8390af73b18e2650ffc",
+ "sha256:7dbdce0c534bbf52274b94768b3498abdf675a691fec5f751b6057b3030f34c1",
+ "sha256:7ef6b5942e6bfc5706301a18a62300c60db9af7f6368042227ccb7eeb22d0892",
+ "sha256:7f5a3ffc731494f1a57bd91c47dc483a1e10048131ffb52d901bfe2beb6102e8",
+ "sha256:8a45b6514861916c429e6059a55cf7db74670eaed2052a648e3e4d04f070e001",
+ "sha256:8ad241da7fac963d7573cc67a064c57c58766b62a9a20c452ca1f21050868dfa",
+ "sha256:8b0886885f7323beea6f552c28bff62cbe0983b9fbb94126531693ea6c5ebb90",
+ "sha256:8ca88da1bd78990b536c4a7765f719803eb4f8f9971cc22d6ca965c10a7f2c4c",
+ "sha256:8e0caeff18b96ea90fc0eb6e3bdb2b10ab5b01a95128dfeccb64a7238decf5f0",
+ "sha256:957403a978e10fb3ca42572a23e6f7badff39aa1ce2f4ade68ee452dc6807692",
+ "sha256:9af69f6746120998cd9c355e9c3c6aec7dff70d47247188feb4f829502be8ab4",
+ "sha256:9c94f7cc91ab16b36ba5ce476f1904c91d6c92441f01cd61a8e2729442d6fcf5",
+ "sha256:a37d51fa9a00d265cf73f3de3930fa9c41548177ba4f0faf76e61d512c774690",
+ "sha256:a3a98921da9a1bf8457aeee6a551948a83601689e5ecdd736894ea9bbec77e83",
+ "sha256:a3c1ebd4ed8e76e886507c9eddb1a891673686c813adf889b864a17fafcf6d66",
+ "sha256:a5f9505efd574d1e5b4a76ac9dd92a12acb2b309551e9aa874c13c11caefbe4f",
+ "sha256:a8ff454ef0bb061e37df03557afda9d785c905dab15584860f982e88be73015f",
+ "sha256:a9d0b68ac1743964755ae2d89772c7e6fb0118acd4d0b7464eaf3921c6b49dd4",
+ "sha256:aa62a07ac93b7cb6b7d0389d8ef57ffc321d78f60c037b19dfa78d6b17c928ee",
+ "sha256:ac741bf78b9bb432e2d314439275235f41656e189856b11fb4e774d9f7246d81",
+ "sha256:ae1e96785696b543394a4e3f15f3f225d44f3c55dafe3f206493031419fedf95",
+ "sha256:b683e5fd7f74fb66e89a1ed16076dbab3f8e9f34c18b1979ded614fe10cdc4d9",
+ "sha256:b7a8b43ee64ca8f4befa2bea4083f7c52c92864d8518244bfa6e88c751fa8fff",
+ "sha256:b8e38472739028e5f2c3a4aded0ab7eadc447f0d84f310c7a8bb697ec417229e",
+ "sha256:bfff48c7bd23c6e2aec6454aaf6edc44444b229e94743b34bdcdda2e35126cf5",
+ "sha256:c14b63c9d7bab795d17392c7c1f9aaabbffd4cf4387725a0ac69109fb3b550c6",
+ "sha256:c27cc1e4b197092e50ddbf0118c788d9977f3f8f35bfbbd3e76c1846a3443df7",
+ "sha256:c28d3309ebd6d6b2cf82969b5179bed5fefe6142c70f354ece94324fa11bf6a1",
+ "sha256:c670f4773f2f6f1957ff8a3962c7dd12e4be54d05839b216cb7fd70b5a1df394",
+ "sha256:ce6910b56b700bea7be82c54ddf2e0ed792a577dfaa4a76b9af07d550af435c6",
+ "sha256:d0213671691e341f6849bf33cd9fad21f7b1cb88b89e024f33370733fec58742",
+ "sha256:d03fe67b2325cb3f09be029fd5da8df9e6974f0cde2c2ac6a79d2634e791dd57",
+ "sha256:d0e5af9a9effb88535a472e19169e09ce750c3d442fb222254a276d77808620b",
+ "sha256:d243b36fbf3d73c25e48014961e83c19c9cc92530516ce3c43050ea6276a2ab7",
+ "sha256:d26166acf62f731f50bdd885b04b38828436d74e8e362bfcb8df221d868b5d9b",
+ "sha256:d403d781b0e06d2922435ce3b8d2376579f0c217ae491e273bab8d092727d244",
+ "sha256:d8716f82502997b3d0895d1c64c3b834181b1eaca28f3f6336a71777e437c2af",
+ "sha256:e4f781ffedd17b0b834c8731b75cce2639d5a8afe961c1e58ee7f1f20b3af185",
+ "sha256:e613a98ead2005c4ce037c7b061f2409a1a4e45099edb0ef3200ee26ed2a69a8",
+ "sha256:ef4163770525257876f10e8ece1cf25b71468316f61451ded1a6f44273eedeb5"
],
"markers": "python_version >= '3.6'",
- "version": "==2022.3.2"
+ "version": "==2022.10.31"
},
"requests": {
"hashes": [
@@ -1003,10 +1033,11 @@
},
"selenium": {
"hashes": [
- "sha256:a733dd77d3171b846893f4d51b18967d809313f547a10974e26579f9ce797462"
+ "sha256:06a1c7d9f313130b21c3218ddd8852070d0e7419afdd31f96160cd576555a5ce",
+ "sha256:3aefa14a28a42e520550c1cd0f29cf1d566328186ea63aa9a3e01fb265b5894d"
],
"index": "pypi",
- "version": "==4.5.0"
+ "version": "==4.7.2"
},
"six": {
"hashes": [
@@ -1049,11 +1080,11 @@
},
"telethon": {
"hashes": [
- "sha256:8df802aad2d11f7198f1d5b1d84c7498ef19c28e160041dcb8aaf0814f91115b",
- "sha256:a085348801bd62db79ad75c9a67c5c8312507b113f0228b92e2dd4397edc7c1d"
+ "sha256:148ac8c27908853d5d8a116d55ce947e9ba167bb697c75226ae95645b2e5a504",
+ "sha256:de7a1619110a2c06390fb5340839c6503c6b108b5f1a2f3bbe1ef60f02cecacb"
],
"index": "pypi",
- "version": "==1.25.4"
+ "version": "==1.26.0"
},
"text-unidecode": {
"hashes": [
@@ -1074,7 +1105,7 @@
"sha256:5f4f682a004951c1b450bc753c710e9280c5746ce6ffedee253ddbcbf54cf1e4",
"sha256:6fee160d6ffcd1b1c68c65f14c829c22832bc401726335ce92c52d395944a6a1"
],
- "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
+ "index": "pypi",
"version": "==4.64.1"
},
"trio": {
@@ -1110,11 +1141,11 @@
},
"tzdata": {
"hashes": [
- "sha256:04a680bdc5b15750c39c12a448885a51134a27ec9af83667663f0b3a1bf3f342",
- "sha256:91f11db4503385928c15598c98573e3af07e7229181bee5375bd30f1695ddcae"
+ "sha256:2b88858b0e3120792a3c0635c23daf36a7d7eeeca657c323da299d2094402a0d",
+ "sha256:fe5f866eddd8b96e9fcba978f8e503c909b19ea7efda11e52e39494bad3a7bfa"
],
"markers": "python_version >= '3.6'",
- "version": "==2022.6"
+ "version": "==2022.7"
},
"tzlocal": {
"hashes": [
@@ -1149,11 +1180,11 @@
},
"vk-url-scraper": {
"hashes": [
- "sha256:3718a569e431c9c2bc7e92e9156e25b7112dc0b9b461c8001fa481a00ccbd3bc",
- "sha256:baebe32bb29d6f188d849f38ecc43d04d5b5bad05db7f31dfdbe450f684042f0"
+ "sha256:1cd6daad89a1f920902cb68c5952c5ab5e80ba2bf4a8c3657c781b5b0f9d406b",
+ "sha256:d430de947575e321cedceecfdf198b8bd14db3026038b924547e8b1c7c6a09ed"
],
"index": "pypi",
- "version": "==0.3.8"
+ "version": "==0.3.10"
},
"websockets": {
"hashes": [
@@ -1266,11 +1297,11 @@
},
"pycodestyle": {
"hashes": [
- "sha256:2c9607871d58c76354b697b42f5d57e1ada7d261c261efac224b664affdc5785",
- "sha256:d1735fc58b418fd7c5f658d28d943854f8a849b01a5d0a1e6f3f3fdd0166804b"
+ "sha256:347187bdb476329d98f695c213d7295a846d1152ff4fe9bacb8a9590b8ee7053",
+ "sha256:8a4eaf0d0495c7395bdab3589ac2db602797d76207242c17d470186815706610"
],
"markers": "python_version >= '3.6'",
- "version": "==2.9.1"
+ "version": "==2.10.0"
},
"tomli": {
"hashes": [
diff --git a/README.md b/README.md
index 11ff002..7edc3fd 100644
--- a/README.md
+++ b/README.md
@@ -22,6 +22,12 @@ Use this to make sure you help making sure you did all the required steps:
* [ ] (optional for instagram) `instaloader.session` file which appears after the 1st run and login in telegram
* [ ] (optional for browsertrix) `profile.tar.gz` file
+### Private telegram channels
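+* Private channels cannot be archived when logging in with a bot token
+* Ideally keep two setups: one that uses a bot token and one that does not
+* On setup, the account joins all private channels given as invite links in `channel_invites`
+* Add each channel's `id` next to its invite link to avoid long rate-limited waits on startup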
+
## Setup
### Always required
1. [A Google Service account is necessary for use with `gspread`.](https://gspread.readthedocs.io/en/latest/oauth2.html#for-bots-using-service-account) Credentials for this account should be stored in `service_account.json`, in the same directory as the script.
diff --git a/orchestrate.yaml b/orchestration.example.yaml
similarity index 80%
rename from orchestrate.yaml
rename to orchestration.example.yaml
index 9626e83..7163829 100644
--- a/orchestrate.yaml
+++ b/orchestration.example.yaml
@@ -8,8 +8,8 @@ steps:
# - twitter
# - instagram
# - webarchive # this way it runs as a failsafe only
- enrichers:
- - screenshot
+ # enrichers:
+ # - screenshot
# - wacz
# - webarchive # this way it runs for every case, webarchive extends archiver and enrichment
# - thumbnails
@@ -29,11 +29,11 @@ configurations:
global:
- save_logs: False
gsheets_feeder:
- sheet: auto-archiver-test
+ sheet: my-auto-archiver
header: 2 # defaults to 1 in GSheetsFeeder
service_account: "secrets/service_account.json"
- allow_worksheets: "aa-refactor-tests"
- block_worksheets: "blocked,test-cases-008"
+ # allow_worksheets: "allowed"
+ # block_worksheets: "blocked1,blocked2"
columns:
'url': 'link'
'status': 'archive status'
@@ -49,6 +49,16 @@ configurations:
'hash': 'hash'
'wacz': 'wacz'
'replaywebpage': 'replaywebpage'
+ telethon:
+ api_id: "1234567"
+ api_hash: "examplehash"
+ session_file: "secrets/anon"
+ channel_invites:
+ - invite: https://t.me/+XXXXXXXXXXXXXX
+ id: 1000000000
+ - invite: https://t.me/joinchat/XXXXXXXXXXXXXX
+ id: 1000000001
+
tiktok:
api_keys:
- username: 1
@@ -60,7 +70,7 @@ configurations:
token: "here"
screenshot:
width: 1280
- height: 720000
+ height: 4600
wacz:
profile: secrets/profile.tar.gz
webarchive:
diff --git a/src/archivers/__init__.py b/src/archivers/__init__.py
index 7f51e39..a2cb67c 100644
--- a/src/archivers/__init__.py
+++ b/src/archivers/__init__.py
@@ -1,5 +1,6 @@
# we need to explicitly expose the available imports here
from .base_archiver import Archiver, ArchiveResult
+from .archiver import Archiverv2
from .telegram_archiver import TelegramArchiver
from .telethon_archiver import TelethonArchiver
from .tiktok_archiver import TiktokArchiver
@@ -8,4 +9,6 @@ from .youtubedl_archiver import YoutubeDLArchiver
from .twitter_archiver import TwitterArchiver
from .vk_archiver import VkArchiver
from .twitter_api_archiver import TwitterApiArchiver
-from .instagram_archiver import InstagramArchiver
\ No newline at end of file
+from .instagram_archiver import InstagramArchiver
+
+from .telethon_archiverv2 import TelethonArchiver
\ No newline at end of file
diff --git a/src/archivers/archiver.py b/src/archivers/archiver.py
new file mode 100644
index 0000000..538804e
--- /dev/null
+++ b/src/archivers/archiver.py
@@ -0,0 +1,26 @@
+from __future__ import annotations
+from abc import abstractmethod
+from dataclasses import dataclass
+from metadata import Metadata
+from steps.step import Step
+
+
+@dataclass
+class Archiverv2(Step):
+    name = "archiver"  # step identifier; concrete archivers override it (e.g. "telethon")
+
+    def __init__(self, config: dict) -> None:
+        # without this Step.__init__ is not called
+        super().__init__(config)
+        # self.setup()
+
+    @staticmethod  # only for typing: forwards to Step.init with Archiverv2 as the expected parent class
+    def init(name: str, config: dict) -> Archiverv2:
+        return Step.init(name, config, Archiverv2)
+
+    def setup(self) -> None:
+        # hook for archivers that need to login or do other one-time setup; no-op by default
+        pass
+
+    @abstractmethod
+    def download(self, item: Metadata) -> Metadata: pass
diff --git a/src/archivers/telethon_archiverv2.py b/src/archivers/telethon_archiverv2.py
new file mode 100644
index 0000000..a4273b0
--- /dev/null
+++ b/src/archivers/telethon_archiverv2.py
@@ -0,0 +1,114 @@
+from archivers import Archiverv2
+from metadata import Metadata
+from telethon.sync import TelegramClient
+from telethon.errors import ChannelInvalidError
+from telethon.tl.types import PeerUser, PeerChat, PeerChannel
+from telethon.tl.functions.messages import ImportChatInviteRequest
+from telethon.errors.rpcerrorlist import UserAlreadyParticipantError, FloodWaitError, InviteRequestSentError, InviteHashExpiredError
+from loguru import logger
+from tqdm import tqdm
+import re, time, json
+
+
+
+class TelethonArchiver(Archiverv2):
+ name = "telethon"
+ link_pattern = re.compile(r"https:\/\/t\.me(\/c){0,1}\/(.+)\/(\d+)")
+    invite_pattern = re.compile(r"t\.me(\/joinchat){0,1}\/\+?(.+)")  # dot escaped (as in link_pattern) so only the literal host t.me matches; group(2) is the invite hash
+
+    def __init__(self, config: dict) -> None:
+        super().__init__(config)  # presumably populates api_id/api_hash/session_file from config — confirm in Step.__init__
+        assert isinstance(self.api_id, str) and len(self.api_id) > 0, f"invalid telethon api_id value ({self.api_id}) should be a valid string"
+        assert isinstance(self.api_hash, str) and len(self.api_hash) > 0, f"invalid telethon api_hash value ({self.api_hash}) should be a valid string"
+
+        self.client = TelegramClient(self.session_file, self.api_id, self.api_hash)
+
+ @staticmethod
+ def configs() -> dict:  # maps each option name to its "default", its "help" text and, optionally, a "cli_set" merge function
+ return {
+ "api_id": {"default": None, "help": "telegram API_ID value, go to https://my.telegram.org/apps"},
+ "api_hash": {"default": None, "help": "telegram API_HASH value, go to https://my.telegram.org/apps"},
+ # "bot_token": {"default": None, "help": "optional, but allows access to more content such as large videos, talk to @botfather"},
+ "session_file": {"default": "secrets/anon", "help": "optional, records the telegram login session for future usage"},
+ "channel_invites": {
+ "default": {},  # NOTE(review): setup() indexes channel_invites by position and reads "invite"/"id" from each entry (a list of dicts) — confirm this dict default matches
+ "help": "(JSON string) private channel invite links (format: t.me/joinchat/HASH OR t.me/+HASH) and (optional but important to avoid hanging for minutes on startup) channel id (format: CHANNEL_ID taken from a post url like https://t.me/c/CHANNEL_ID/1), the telegram account will join any new channels on setup",
+ "cli_set": lambda cli_val, cur_val: dict(cur_val, **json.loads(cli_val))  # NOTE(review): merges as dicts; verify against the list-shaped value used in setup()
+ }
+ }
+
+ def setup(self) -> None:  # one-time setup: checks the login, then tries to join every channel listed in channel_invites
+ logger.info(f"SETUP {self.name} checking login...")
+ with self.client.start(): pass  # open and immediately close a client session so login happens up front
+
+ if len(self.channel_invites):
+ logger.info(f"SETUP {self.name} joining channels...")
+ with self.client.start():
+ # get currently joined channels
+ # https://docs.telethon.dev/en/stable/modules/custom.html#module-telethon.tl.custom.dialog
+ joined_channel_ids = [c.id for c in self.client.get_dialogs() if c.is_channel]  # only used for the log line below
+ logger.info(f"already part of {len(joined_channel_ids)} channels")
+
+ i = 0  # manual index so an entry can be retried: 'continue' below skips the increment
+ pbar = tqdm(desc=f"joining {len(self.channel_invites)} invite links", total=len(self.channel_invites))
+ while i < len(self.channel_invites):
+ channel_invite = self.channel_invites[i]
+ channel_id = channel_invite.get("id", False)
+ invite = channel_invite["invite"]
+ if (match := self.invite_pattern.search(invite)):
+ try:
+ if channel_id:
+ ent = self.client.get_entity(int(channel_id)) # fails if not a member
+ else:
+ ent = self.client.get_entity(invite) # fails if not a member
+ logger.warning(f"please add the property id='{ent.id}' to the 'channel_invites' configuration where {invite=}, not doing so can lead to a minutes-long setup time due to telegram's rate limiting.")
+ except ValueError as e:  # lookup failed, treated as "not a member yet": try to join via the invite hash
+ logger.info(f"joining new channel {invite=}")
+ try:
+ self.client(ImportChatInviteRequest(match.group(2)))  # group(2) is the invite hash captured by invite_pattern
+ except UserAlreadyParticipantError as e:
+ logger.info(f"already joined {invite=}")
+ except InviteRequestSentError:
+ logger.warning(f"already sent a join request with {invite} still no answer")
+ except InviteHashExpiredError:
+ logger.warning(f"{invite=} has expired please find a more recent one")
+ except Exception as e:
+ logger.error(f"could not join channel with {invite=} due to {e}")
+ except FloodWaitError as e:  # NOTE(review): indentation is lost in this view; presumably this belongs to the outer try (after 'except Exception' on the inner try it would be unreachable) — confirm
+ logger.warning(f"got a flood error, need to wait {e.seconds} seconds")
+ time.sleep(e.seconds)
+ continue  # retry the same invite after waiting (i is not incremented)
+ else:
+ logger.warning(f"Invalid invite link {invite}")
+ i+=1
+ pbar.update()
+
+
+    def download(self, item: Metadata) -> Metadata:
+        url = self.get_url(item)
+        logger.debug(f"downloading {url=}")
+        # detect URLs that we definitely cannot handle
+        match = self.link_pattern.search(url)
+        if not match: return False
+
+        # app will ask (stall for user input!) for phone number and auth code if anon.session not found
+        # TODO: not using bot_token since then private channels cannot be archived
+        # with self.client.start(bot_token=self.bot_token):
+        with self.client.start():
+            # group(1) is "/c" for private-channel links, in which case group(2) is a numeric channel id
+            is_private = match.group(1) == "/c"
+            logger.debug(f"{is_private=}")
+            chat = int(match.group(2)) if is_private else match.group(2)
+            post_id = int(match.group(3))
+
+            try:
+                post = self.client.get_messages(chat, ids=post_id)
+            except ValueError as e:
+                logger.error(f"Could not fetch telegram {url} possibly it's private: {e}")
+                return False
+            except ChannelInvalidError as e:
+                logger.error(f"Could not fetch telegram {url}. This error can be fixed if you setup a bot_token in addition to api_id and api_hash: {e}")
+                return False
+
+            if post is None: return False
+            logger.debug(f"fetched telegram post: {post}")  # NOTE: the success path ends here, so the method currently returns None
diff --git a/src/configs/v2config.py b/src/configs/v2config.py
index 50c8b0f..75a125e 100644
--- a/src/configs/v2config.py
+++ b/src/configs/v2config.py
@@ -3,9 +3,9 @@
import argparse, yaml
from dataclasses import dataclass, field
from typing import List
-from feeders.feeder import Feeder
+from archivers import Archiverv2
+from feeders import Feeder
from steps.step import Step
-from utils import Util
from enrichers import Enricher
from collections import defaultdict
@@ -16,10 +16,11 @@ class ConfigV2:
configurable_parents = [
Feeder,
Enricher,
+ Archiverv2,
# Util
]
feeder: Step # TODO:= BaseFeeder
- archivers: List[Step] = field(default_factory=[]) # TODO: fix type
+    archivers: List[Archiverv2] = field(default_factory=list)  # default_factory must be a zero-argument callable, not a list instance
enrichers: List[Enricher] = field(default_factory=[])
formatters: List[Step] = field(default_factory=[]) # TODO: fix type
storages: List[Step] = field(default_factory=[]) # TODO: fix type
@@ -48,7 +49,7 @@ class ConfigV2:
assert "." not in child.name, f"class prop name cannot contain dots('.'): {child.name}"
assert "." not in config, f"config property cannot contain dots('.'): {config}"
config_path = f"{child.name}.{config}"
- parser.add_argument(f'--{config_path}', action='store', dest=config_path, help=details['help'])
+ parser.add_argument(f'--{config_path}', action='store', dest=config_path, help=f"{details['help']} (defaults to {details['default']})")
self.defaults[config_path] = details["default"]
if "cli_set" in details:
self.cli_ops[config_path] = details["cli_set"]
@@ -82,9 +83,11 @@ class ConfigV2:
self.feeder = Feeder.init(steps.get("feeder", "cli_feeder"), self.config)
self.enrichers = [Enricher.init(e, self.config) for e in steps.get("enrichers", [])]
+ self.archivers = [Archiverv2.init(e, self.config) for e in steps.get("archivers", [])]
print("feeder", self.feeder)
print("enrichers", [e for e in self.enrichers])
+ print("archivers", [e for e in self.archivers])
def validate(self):
pass
diff --git a/src/enrichers/enricher_screenshot.py b/src/enrichers/enricher_screenshot.py
index 04a2bf0..5018859 100644
--- a/src/enrichers/enricher_screenshot.py
+++ b/src/enrichers/enricher_screenshot.py
@@ -1,6 +1,9 @@
+from utils import Webdriver
from . import Enricher
from metadata import Metadata
from loguru import logger
+from selenium.common.exceptions import TimeoutException
+import time
class ScreenshotEnricher(Enricher):
@@ -11,43 +14,19 @@ class ScreenshotEnricher(Enricher):
return {
"width": {"default": 1280, "help": "width of the screenshots"},
"height": {"default": 720, "help": "height of the screenshots"},
+ "timeout": {"default": 60, "help": "timeout for taking the screenshot"}
}
def enrich(self, item: Metadata) -> Metadata:
url = self.get_url(item)
- print("enrich")
- # driver = config.webdriver
- # with driver as Webdriver(): # TODO: make a util
- # #TODO: take screenshot
- # pass
+ print(f"enriching {url=}")
+ with Webdriver(self.width, self.height, self.timeout, 'facebook.com' in url) as driver: # TODO: make a util
+ try:
+ driver.get(url)
+ time.sleep(2)
+ except TimeoutException:
+ logger.info("TimeoutException loading page for screenshot")
- # logger.debug(f"getting screenshot for {url=}")
- # key = self._get_key_from_url(url, ".png", append_datetime=True)
- # filename = os.path.join(Storage.TMP_FOLDER, key)
-
- # # Accept cookies popup dismiss for ytdlp video
- # if 'facebook.com' in url:
- # try:
- # logger.debug(f'Trying fb click accept cookie popup for {url}')
- # self.driver.get("http://www.facebook.com")
- # foo = self.driver.find_element(By.XPATH, "//button[@data-cookiebanner='accept_only_essential_button']")
- # foo.click()
- # logger.debug(f'fb click worked')
- # # linux server needs a sleep otherwise facebook cookie won't have worked and we'll get a popup on next page
- # time.sleep(2)
- # except:
- # logger.warning(f'Failed on fb accept cookies for url {url}')
-
- # try:
- # self.driver.get(url)
- # time.sleep(6)
- # except TimeoutException:
- # logger.info("TimeoutException loading page for screenshot")
-
- # self.driver.save_screenshot(filename)
- # self.storage.upload(filename, key, extra_args={'ACL': 'public-read', 'ContentType': 'image/png'})
-
- # cdn_url = self.storage.get_cdn_url(key)
- # self.add_to_media(cdn_url, key)
-
- # return cdn_url
+ #TODO: return saved object
+ driver.save_screenshot("TODO-HASH_OR_UUID.png")
+ return None
diff --git a/src/feeders/feeder_gsheet.py b/src/feeders/feeder_gsheet.py
index a99376f..ad28af1 100644
--- a/src/feeders/feeder_gsheet.py
+++ b/src/feeders/feeder_gsheet.py
@@ -4,7 +4,7 @@ import json, gspread
from loguru import logger
# from . import Enricher
-from feeders.feeder import Feeder
+from feeders import Feeder
from steps.gsheet import Gsheets
from utils import GWorksheet
@@ -30,7 +30,7 @@ class GsheetsFeeder(Gsheets, Feeder):
},
"block_worksheets": {
"default": set(),
- "help": "(CSV) explicitly block some worksheets from being processed, defaults to empty",
+ "help": "(CSV) explicitly block some worksheets from being processed",
"cli_set": lambda cli_val, cur_val: set(cli_val.split(","))
}
})
diff --git a/src/metadata.py b/src/metadata.py
index 39b62ff..d56fcd9 100644
--- a/src/metadata.py
+++ b/src/metadata.py
@@ -1,6 +1,6 @@
from __future__ import annotations
-from typing import Union, Dict
+from typing import Any, Union, Dict
from dataclasses import dataclass
@@ -12,18 +12,28 @@ class Metadata:
# title: str
# url: str
# hash: str
- metadata: Dict[str, Metadata]
+ metadata: Dict[str, Any]
- @staticmethod
- def merge(left: Metadata, right: Metadata, overwrite_left=True) -> Metadata:
+ # TODO: remove and use default?
+ def __init__(self) -> None:
+ self.status = ""
+ self.metadata = {}
+
+ # @staticmethod
+ def merge(self: Metadata, right: Metadata, overwrite_left=True) -> Metadata:
# should return a merged version of the Metadata
# will work for archived() and enriched()
# what if 2 metadatas contain the same keys? only one can remain! : overwrite_left
pass
- def get(self, key: str) -> Union[Metadata, str]:
+ # TODO: setters?
+ def set(self, key: str, val: Any) -> Union[Metadata, str]:
# goes through metadata and returns the Metadata available
- pass
+ self.metadata[key] = val
+
+    def get(self, key: str, default: Any = None) -> Any:
+        # returns the value stored under key in self.metadata, or default when the key is absent
+        return self.metadata.get(key, default)
def as_json(self) -> str:
# converts all metadata and data into JSON
diff --git a/src/orchestrator.py b/src/orchestrator.py
index f32f4c9..5889497 100644
--- a/src/orchestrator.py
+++ b/src/orchestrator.py
@@ -1,8 +1,11 @@
from __future__ import annotations
+from typing import List
from typing import Union, Dict
from dataclasses import dataclass
+from archivers.archiver import Archiverv2
from enrichers.enricher import Enricher
+from metadata import Metadata
"""
how not to couple the different pieces of logic
@@ -108,12 +111,13 @@ Once an archiver returns a link to a local file (for eg to a storage), how do we
The context metadata should include a temporary folder (maybe a LocalStorage instance?)
"""
+
class ArchivingOrchestrator:
def __init__(self, config) -> None:
# in config.py we should test that the archivers exist and log mismatches (blocking execution)
# identify each formatter, storage, database, etc
# self.feeder = Feeder.init(config.feeder, config.get(config.feeder))
-
+
# Is it possible to overwrite config.yaml values? it could be useful: share config file and modify gsheets_feeder.sheet via CLI
# where does that update/processing happen? in config.py
# reflection for Archiver to know wihch child classes it has? use Archiver.__subclasses__
@@ -123,7 +127,13 @@ class ArchivingOrchestrator:
# ]
self.feeder = config.feeder
self.enrichers = config.enrichers
+ self.archivers: List[Archiverv2] = config.archivers
+ for a in self.archivers: a.setup()
+
+ self.formatters = []
+ self.storages = []
+ self.databases = []
# self.formatters = [
# Formatter.init(f, config)
# for f in config.formatters
@@ -145,30 +155,33 @@ class ArchivingOrchestrator:
def feed(self) -> list(ArchiveResult):
for url in self.feeder:
print("ARCHIVING", url)
- # self.archive(url)
+ self.archive(url)
# how does this handle the parameters like folder which can be different for each archiver?
# the storage needs to know where to archive!!
- # solution: feeders have context: extra metadata that they can read or ignore,
+ # solution: feeders have context: extra metadata that they can read or ignore,
# all of it should have sensible defaults (eg: folder)
# default feeder is a list with 1 element
def archive(self, url) -> Union[ArchiveResult, None]:
- url = clear_url(url)
- result = Metadata(url=url)
-
+ # TODO:
+ # url = clear_url(url)
+ # result = Metadata(url=url)
+ result = Metadata()
+ result.set("url", url)
should_archive = True
- for d in databases: should_archive &= d.should_process(url)
+ for d in self.databases: should_archive &= d.should_process(url)
# should storages also be able to check?
- for s in storages: should_archive &= s.should_process(url)
+ for s in self.storages: should_archive &= s.should_process(url)
if not should_archive:
+ print("skipping")
return "skipping"
# signal to DB that archiving has started
- for d in databases:
+ for d in self.databases:
# are the databases to decide whether to archive?
- # they can simply return True by default, otherwise they can avoid duplicates. should this logic be more granular, for example on the archiver level: a tweet will not need be scraped twice, whereas an instagram profile might. the archiver could not decide from the link which parts to archive,
+ # they can simply return True by default, otherwise they can avoid duplicates. should this logic be more granular, for example on the archiver level: a tweet will not need be scraped twice, whereas an instagram profile might. the archiver could not decide from the link which parts to archive,
# instagram profile example: it would always re-archive everything
# maybe the database/storage could use a hash/key to decide if there's a need to re-archive
if d.should_process(url):
@@ -180,15 +193,15 @@ class ArchivingOrchestrator:
return
# vk, telethon, ...
- for a in archivers:
+ for a in self.archivers:
# with automatic try/catch in download + archived (+ the other ops below)
- # should the archivers come with the config already? are there configs which change at runtime?
+ # should the archivers come with the config already? are there configs which change at runtime?
# think not, so no need to pass config as parameter
- # do they need to be refreshed with every execution?
+ # do they need to be refreshed with every execution?
# this is where the Hashes come from, the place with access to all content
# the archiver does not have access to storage
- result.update(a.download(url))
- if result.is_success(): break
+ result.merge(a.download(result))
+ if True or result.is_success(): break
# what if an archiver returns multiple entries and one is to be part of HTMLgenerator?
# should it call the HTMLgenerator as if it's not an enrichment?
@@ -196,20 +209,20 @@ class ArchivingOrchestrator:
# then how to execute it last? should there also be post-processors? are there other examples?
# maybe as a PDF? or a Markdown file
# side captures: screenshot, wacz, webarchive, thumbnails, HTMLgenerator
- for e in enrichers:
- result.update(e.enrich(result))
+ for e in self.enrichers:
+ result.merge(e.enrich(result))
# formatters, enrichers, and storages will sometimes look for specific properties: eg
Screenshot:
- for p in formatter:
- result.update(p.process(result))
+ for f in self.formatters:
+ result.merge(f.format(result))
# storages
- for s in storages:
+ for s in self.storages:
for m in result.media:
- m.update(s.store(m))
+ m.merge(s.store(m))
# signal completion to databases (DBs, Google Sheets, CSV, ...)
# a hash registration service could be one database: forensic archiving
- for d in databases: d.done( result)
+ for d in self.databases: d.done(result)
- return result
\ No newline at end of file
+ return result
diff --git a/src/steps/gsheet.py b/src/steps/gsheet.py
index 9654da4..279c036 100644
--- a/src/steps/gsheet.py
+++ b/src/steps/gsheet.py
@@ -36,7 +36,7 @@ class Gsheets(Step):
'wacz': 'wacz',
'replaywebpage': 'replaywebpage',
},
- "help": "names of columns in the google sheet",
+ "help": "names of columns in the google sheet (stringified JSON object)",
"cli_set": lambda cli_val, cur_val: dict(cur_val, **json.loads(cli_val))
},
}
\ No newline at end of file
diff --git a/src/utils/__init__.py b/src/utils/__init__.py
index ad56f36..9aff525 100644
--- a/src/utils/__init__.py
+++ b/src/utils/__init__.py
@@ -1,4 +1,5 @@
# we need to explicitly expose the available imports here
from .gworksheet import GWorksheet
from .misc import *
-from .util import Util
\ No newline at end of file
+from .util import Util
+from .webdriver import Webdriver
\ No newline at end of file
diff --git a/src/utils/util.py b/src/utils/util.py
index 714d499..e465bda 100644
--- a/src/utils/util.py
+++ b/src/utils/util.py
@@ -1,11 +1,12 @@
from __future__ import annotations
+from abc import abstractmethod
from dataclasses import dataclass
-from abc import abstractmethod, ABC
from metadata import Metadata
from steps.step import Step
+#TODO: likely unused
@dataclass
-class Util(Step, ABC):
+class Util(Step):
name = "util"
def __init__(self, config: dict) -> None:
diff --git a/src/utils/webdriver.py b/src/utils/webdriver.py
new file mode 100644
index 0000000..5ce0374
--- /dev/null
+++ b/src/utils/webdriver.py
@@ -0,0 +1,45 @@
+from __future__ import annotations
+from selenium import webdriver
+from selenium.common.exceptions import TimeoutException
+from loguru import logger
+from selenium.webdriver.common.by import By
+import time
+
+
+class Webdriver:
+ def __init__(self, width: int, height: int, timeout_seconds: int, facebook_accept_cookies: bool = False) -> webdriver:
+ self.width = width
+ self.height = height
+ self.timeout_seconds = timeout_seconds
+ self.facebook_accept_cookies = facebook_accept_cookies
+
+ def __enter__(self) -> webdriver:
+ options = webdriver.FirefoxOptions()
+ options.headless = True
+ options.set_preference('network.protocol-handler.external.tg', False)
+ try:
+ self.driver = webdriver.Firefox(options=options)
+ self.driver.set_window_size(self.width, self.height)
+ self.driver.set_page_load_timeout(self.timeout_seconds)
+ except TimeoutException as e:
+ logger.error(f"failed to get new webdriver, possibly due to insufficient system resources or timeout settings: {e}")
+
+ if self.facebook_accept_cookies:
+ try:
+ logger.debug(f'Trying fb click accept cookie popup.')
+ self.driver.get("http://www.facebook.com")
+ foo = self.driver.find_element(By.XPATH, "//button[@data-cookiebanner='accept_only_essential_button']")
+ foo.click()
+ logger.debug(f'fb click worked')
+ # linux server needs a sleep otherwise facebook cookie won't have worked and we'll get a popup on next page
+ time.sleep(2)
+ except:
+ logger.warning(f'Failed on fb accept cookies.')
+
+ return self.driver
+
+ def __exit__(self, exc_type, exc_val, exc_tb):
+ self.driver.close()
+ self.driver.quit()
+ del self.driver
+ return True
From 53ffa2d4aee2e86b181eb8e2f19ff1fc33ac3456 Mon Sep 17 00:00:00 2001
From: msramalho <19508417+msramalho@users.noreply.github.com>
Date: Wed, 14 Dec 2022 15:37:34 +0000
Subject: [PATCH 026/190] telethon_archiver working for multiple media
---
src/archivers/archiver.py | 21 ++++++-
src/archivers/telethon_archiverv2.py | 86 ++++++++++++++++++++++------
src/metadata.py | 32 ++++++++++-
src/orchestrator.py | 12 +++-
src/steps/step.py | 5 --
5 files changed, 125 insertions(+), 31 deletions(-)
diff --git a/src/archivers/archiver.py b/src/archivers/archiver.py
index 538804e..37f5d4d 100644
--- a/src/archivers/archiver.py
+++ b/src/archivers/archiver.py
@@ -3,6 +3,7 @@ from abc import abstractmethod
from dataclasses import dataclass
from metadata import Metadata
from steps.step import Step
+import mimetypes, requests
@dataclass
@@ -12,9 +13,9 @@ class Archiverv2(Step):
def __init__(self, config: dict) -> None:
# without this STEP.__init__ is not called
super().__init__(config)
- # self.setup()
# only for typing...
+
def init(name: str, config: dict) -> Archiverv2:
return Step.init(name, config, Archiverv2)
@@ -22,5 +23,23 @@ class Archiverv2(Step):
# used when archivers need to login or do other one-time setup
pass
+ def _guess_file_type(self, path: str) -> str:
+ """
+ Receives a URL or filename and returns global mimetype like 'image' or 'video'
+ see https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/MIME_types/Common_types
+ """
+ mime = mimetypes.guess_type(path)[0]
+ if mime is not None:
+ return mime.split("/")[0]
+ return ""
+
+ def download_from_url(self, url:str, to_filename:str) -> None:
+ headers = {
+ 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'
+ }
+ d = requests.get(url, headers=headers)
+ with open(to_filename, 'wb') as f:
+ f.write(d.content)
+
@abstractmethod
def download(self, item: Metadata) -> Metadata: pass
diff --git a/src/archivers/telethon_archiverv2.py b/src/archivers/telethon_archiverv2.py
index a4273b0..267dc2d 100644
--- a/src/archivers/telethon_archiverv2.py
+++ b/src/archivers/telethon_archiverv2.py
@@ -7,8 +7,7 @@ from telethon.tl.functions.messages import ImportChatInviteRequest
from telethon.errors.rpcerrorlist import UserAlreadyParticipantError, FloodWaitError, InviteRequestSentError, InviteHashExpiredError
from loguru import logger
from tqdm import tqdm
-import re, time, json
-
+import re, time, json, os
class TelethonArchiver(Archiverv2):
@@ -38,6 +37,10 @@ class TelethonArchiver(Archiverv2):
}
def setup(self) -> None:
+ """
+ 1. trigger login process for telegram or proceed if already saved in a session file
+ 2. joins channel_invites where needed
+ """
logger.info(f"SETUP {self.name} checking login...")
with self.client.start(): pass
@@ -56,11 +59,11 @@ class TelethonArchiver(Archiverv2):
channel_id = channel_invite.get("id", False)
invite = channel_invite["invite"]
if (match := self.invite_pattern.search(invite)):
- try:
+ try:
if channel_id:
- ent = self.client.get_entity(int(channel_id)) # fails if not a member
+ ent = self.client.get_entity(int(channel_id)) # fails if not a member
else:
- ent = self.client.get_entity(invite) # fails if not a member
+ ent = self.client.get_entity(invite) # fails if not a member
logger.warning(f"please add the property id='{ent.id}' to the 'channel_invites' configuration where {invite=}, not doing so can lead to a minutes-long setup time due to telegram's rate limiting.")
except ValueError as e:
logger.info(f"joining new channel {invite=}")
@@ -80,35 +83,80 @@ class TelethonArchiver(Archiverv2):
continue
else:
logger.warning(f"Invalid invite link {invite}")
- i+=1
+ i += 1
pbar.update()
-
def download(self, item: Metadata) -> Metadata:
- url = self.get_url(item)
+ url = item.get_url()
+
print(f"downloading {url=}")
# detect URLs that we definitely cannot handle
match = self.link_pattern.search(url)
if not match: return False
- # app will ask (stall for user input!) for phone number and auth code if anon.session not found
- # TODO: not using bot_token since then private channels cannot be archived
- # with self.client.start(bot_token=self.bot_token):
- with self.client.start():
- # self.client(ImportChatInviteRequest('4kAkN49IKJBhZDk6'))
- is_private = match.group(1) == "/c"
- print(f"{is_private=}")
- chat = int(match.group(2)) if is_private else match.group(2)
- post_id = int(match.group(3))
+ is_private = match.group(1) == "/c"
+ chat = int(match.group(2)) if is_private else match.group(2)
+ post_id = int(match.group(3))
+ result = Metadata()
+
+ # NB: not using bot_token since then private channels cannot be archived: self.client.start(bot_token=self.bot_token)
+ with self.client.start():
try:
post = self.client.get_messages(chat, ids=post_id)
except ValueError as e:
logger.error(f"Could not fetch telegram {url} possibly it's private: {e}")
return False
except ChannelInvalidError as e:
- logger.error(f"Could not fetch telegram {url}. This error can be fixed if you setup a bot_token in addition to api_id and api_hash: {e}")
+ logger.error(f"Could not fetch telegram {url}. This error may be fixed if you setup a bot_token in addition to api_id and api_hash (but then private channels will not be archived, we need to update this logic to handle both): {e}")
return False
if post is None: return False
- print(post)
+ logger.info(f"fetched telegram {post.id=}")
+
+ media_posts = self._get_media_posts_in_group(chat, post)
+ logger.debug(f'got {len(media_posts)=} for {url=}')
+
+ tmp_dir = item.get("tmp_dir")
+
+ group_id = post.grouped_id if post.grouped_id is not None else post.id
+ title = post.message
+ for mp in media_posts:
+ if len(mp.message) > len(title): title = mp.message # save the longest text found (usually only 1)
+
+ # media can also be in entities
+ if mp.entities:
+ other_media_urls = [e.url for e in mp.entities if hasattr(e, "url") and e.url and self._guess_file_type(e.url) in ["video", "image"]]
+ logger.debug(f"Got {len(other_media_urls)} other medial urls from {mp.id=}: {other_media_urls}")
+ for om_url in other_media_urls:
+ filename = os.path.join(tmp_dir, f'{chat}_{group_id}_{self._get_key_from_url(om_url)}')
+ self.download_from_url(om_url, filename)
+ result.add_media(filename)
+
+ filename_dest = os.path.join(tmp_dir, f'{chat}_{group_id}', str(mp.id))
+ filename = self.client.download_media(mp.media, filename_dest)
+ if not filename:
+ logger.debug(f"Empty media found, skipping {str(mp)=}")
+ continue
+ result.add_media(filename)
+
+ result.set("post", post).set_title(title).set_timestamp(post.date)
+ return result
+
+ def _get_media_posts_in_group(self, chat, original_post, max_amp=10):
+ """
+ Searches for Telegram posts that are part of the same group of uploads
+ The search is conducted around the id of the original post with an amplitude
+ of `max_amp` both ways
+ Returns a list of [post] where each post has media and is in the same grouped_id
+ """
+ if getattr(original_post, "grouped_id", None) is None:
+ return [original_post] if getattr(original_post, "media", False) else []
+
+ search_ids = [i for i in range(original_post.id - max_amp, original_post.id + max_amp + 1)]
+ posts = self.client.get_messages(chat, ids=search_ids)
+ media = []
+ for post in posts:
+ if post is not None and post.grouped_id == original_post.grouped_id and post.media is not None:
+ media.append(post)
+ return media
diff --git a/src/metadata.py b/src/metadata.py
index d56fcd9..e1e8d8b 100644
--- a/src/metadata.py
+++ b/src/metadata.py
@@ -1,7 +1,9 @@
from __future__ import annotations
+from ast import List
from typing import Any, Union, Dict
from dataclasses import dataclass
+from datetime import datetime
@dataclass
@@ -15,8 +17,8 @@ class Metadata:
metadata: Dict[str, Any]
# TODO: remove and use default?
- def __init__(self) -> None:
- self.status = ""
+ def __init__(self, status="") -> None:
+ self.status = status
self.metadata = {}
# @staticmethod
@@ -27,14 +29,38 @@ class Metadata:
pass
# TODO: setters?
- def set(self, key: str, val: Any) -> Union[Metadata, str]:
+ def set(self, key: str, val: Any) -> Metadata:
# goes through metadata and returns the Metadata available
self.metadata[key] = val
+ return self
def get(self, key: str, default: Any = None) -> Union[Metadata, str]:
# goes through metadata and returns the Metadata available
return self.metadata.get(key, default)
+# custom getter/setters
+
+ def set_url(self, url: str) -> Metadata:
+ assert type(url) is str and len(url) > 0, "invalid URL"
+ return self.set("url", url)
+
+ def get_url(self) -> str:
+ url = self.get("url")
+ assert type(url) is str and len(url) > 0, "invalid URL"
+ return url
+
+ def get_media(self) -> List:
+ return self.get("media", [])
+
+ def set_title(self, title: str) -> Metadata:
+ return self.set("title", title)
+
+ def set_timestamp(self, title: datetime) -> Metadata:
+ return self.set("title", title)
+
+ def add_media(self, filename: str) -> Metadata:
+ return self.get_media().append(filename)
+
def as_json(self) -> str:
# converts all metadata and data into JSON
pass
diff --git a/src/orchestrator.py b/src/orchestrator.py
index 5889497..804948e 100644
--- a/src/orchestrator.py
+++ b/src/orchestrator.py
@@ -6,6 +6,7 @@ from archivers.archiver import Archiverv2
from enrichers.enricher import Enricher
from metadata import Metadata
+import tempfile, time
"""
how not to couple the different pieces of logic
@@ -155,19 +156,24 @@ class ArchivingOrchestrator:
def feed(self) -> list(ArchiveResult):
for url in self.feeder:
print("ARCHIVING", url)
- self.archive(url)
+ with tempfile.TemporaryDirectory(dir="./") as tmp_dir:
+ self.archive(url, tmp_dir)
+
+ print("holding on")
+ time.sleep(300)
# how does this handle the parameters like folder which can be different for each archiver?
# the storage needs to know where to archive!!
# solution: feeders have context: extra metadata that they can read or ignore,
# all of it should have sensible defaults (eg: folder)
# default feeder is a list with 1 element
- def archive(self, url) -> Union[ArchiveResult, None]:
+ def archive(self, url: str, tmp_dir: str) -> Union[Metadata, None]:
# TODO:
# url = clear_url(url)
# result = Metadata(url=url)
result = Metadata()
- result.set("url", url)
+ result.set_url(url)
+ result.set("tmp_dir", tmp_dir)
should_archive = True
for d in self.databases: should_archive &= d.should_process(url)
diff --git a/src/steps/step.py b/src/steps/step.py
index 04d7a61..4d7e6c1 100644
--- a/src/steps/step.py
+++ b/src/steps/step.py
@@ -29,8 +29,3 @@ class Step(ABC):
print(sub.name, "CALLING NEW")
return sub(config)
raise ClassFoundException(f"Unable to initialize STEP with {name=}")
-
- def get_url(self, item: Metadata) -> str:
- url = item.get("url")
- assert type(url) is str and len(url) > 0
- return url
From 9c056d001c3cf5e0d26156e07ad62fe7d1bbecc3 Mon Sep 17 00:00:00 2001
From: msramalho <19508417+msramalho@users.noreply.github.com>
Date: Wed, 14 Dec 2022 16:11:06 +0000
Subject: [PATCH 027/190] merge logic started
---
src/archivers/telethon_archiverv2.py | 13 ++++++----
src/metadata.py | 38 ++++++++++++++++++++++------
src/orchestrator.py | 5 ++--
src/steps/step.py | 4 +--
4 files changed, 43 insertions(+), 17 deletions(-)
diff --git a/src/archivers/telethon_archiverv2.py b/src/archivers/telethon_archiverv2.py
index 267dc2d..4fa3ce0 100644
--- a/src/archivers/telethon_archiverv2.py
+++ b/src/archivers/telethon_archiverv2.py
@@ -87,9 +87,11 @@ class TelethonArchiver(Archiverv2):
pbar.update()
def download(self, item: Metadata) -> Metadata:
+ """
+ if this url is archivable will download post info and look for other posts from the same group with media.
+ can handle private/public channels
+ """
url = item.get_url()
-
- print(f"downloading {url=}")
# detect URLs that we definitely cannot handle
match = self.link_pattern.search(url)
if not match: return False
@@ -126,8 +128,9 @@ class TelethonArchiver(Archiverv2):
# media can also be in entities
if mp.entities:
- other_media_urls = [e.url for e in mp.entities if hasattr(e, "url") and e.url and self._guess_file_type(e.url) in ["video", "image"]]
- logger.debug(f"Got {len(other_media_urls)} other medial urls from {mp.id=}: {other_media_urls}")
+ other_media_urls = [e.url for e in mp.entities if hasattr(e, "url") and e.url and self._guess_file_type(e.url) in ["video", "image", "audio"]]
+ if len(other_media_urls):
+ logger.debug(f"Got {len(other_media_urls)} other medial urls from {mp.id=}: {other_media_urls}")
for om_url in other_media_urls:
filename = os.path.join(tmp_dir, f'{chat}_{group_id}_{self._get_key_from_url(om_url)}')
self.download_from_url(om_url, filename)
@@ -140,7 +143,7 @@ class TelethonArchiver(Archiverv2):
continue
result.add_media(filename)
- result.set("post", post).set_title(title).set_timestamp(post.date)
+ result.set("post", str(post)).set_title(title).set_timestamp(post.date)
return result
def _get_media_posts_in_group(self, chat, original_post, max_amp=10):
diff --git a/src/metadata.py b/src/metadata.py
index e1e8d8b..193003f 100644
--- a/src/metadata.py
+++ b/src/metadata.py
@@ -4,6 +4,7 @@ from ast import List
from typing import Any, Union, Dict
from dataclasses import dataclass
from datetime import datetime
+import json
@dataclass
@@ -21,12 +22,25 @@ class Metadata:
self.status = status
self.metadata = {}
- # @staticmethod
def merge(self: Metadata, right: Metadata, overwrite_left=True) -> Metadata:
- # should return a merged version of the Metadata
- # will work for archived() and enriched()
- # what if 2 metadatas contain the same keys? only one can remain! : overwrite_left
- pass
+ """
+ merges to Metadata instances, will overwrite according to overwrite_left flag
+ """
+ res = Metadata()
+ if overwrite_left:
+ res.status = right.status
+ res.metadata = dict(self.metadata) # make a copy
+ for k, v in right.metadata.items():
+ print(type(v), type(self.get(k)))
+ # assert type(v) == type(self.get(k))
+ if type(v) not in [dict, list, set] or k not in res.metadata:
+ res.set(k, v)
+ else: # key conflict
+ if type(v) in [dict, set]: res.set(k, self.get(k) | v)
+ elif type(v) == list: res.set(k, self.get(k) + v)
+ else: # invert and do same logic
+ return right.merge(self)
+ return res
# TODO: setters?
def set(self, key: str, val: Any) -> Metadata:
@@ -34,8 +48,10 @@ class Metadata:
self.metadata[key] = val
return self
- def get(self, key: str, default: Any = None) -> Union[Metadata, str]:
+ def get(self, key: str, default: Any = None, create_if_missing=False) -> Union[Metadata, str]:
# goes through metadata and returns the Metadata available
+ if create_if_missing and key not in self.metadata:
+ self.metadata[key] = default
return self.metadata.get(key, default)
# custom getter/setters
@@ -50,7 +66,11 @@ class Metadata:
return url
def get_media(self) -> List:
- return self.get("media", [])
+ return self.get("media", [], create_if_missing=True)
+
+ def set_content(self, content: str) -> Metadata:
+ # the main textual content/information from a social media post, webpage, ...
+ return self.set("content", content)
def set_title(self, title: str) -> Metadata:
return self.set("title", title)
@@ -59,8 +79,10 @@ class Metadata:
return self.set("title", title)
def add_media(self, filename: str) -> Metadata:
+ # print(f"adding {filename} to {self.metadata.get('media')}")
+ # return self.set("media", self.get_media() + [filename])
return self.get_media().append(filename)
def as_json(self) -> str:
# converts all metadata and data into JSON
- pass
+ return json.dumps(self.metadata)
diff --git a/src/orchestrator.py b/src/orchestrator.py
index 804948e..9a523bf 100644
--- a/src/orchestrator.py
+++ b/src/orchestrator.py
@@ -157,8 +157,9 @@ class ArchivingOrchestrator:
for url in self.feeder:
print("ARCHIVING", url)
with tempfile.TemporaryDirectory(dir="./") as tmp_dir:
- self.archive(url, tmp_dir)
-
+ result = self.archive(url, tmp_dir)
+ print(result)
+ print(result.as_json())
print("holding on")
time.sleep(300)
# how does this handle the parameters like folder which can be different for each archiver?
diff --git a/src/steps/step.py b/src/steps/step.py
index 4d7e6c1..7a2135c 100644
--- a/src/steps/step.py
+++ b/src/steps/step.py
@@ -22,10 +22,10 @@ class Step(ABC):
def init(name: str, config: dict, child: Type[Step]) -> Step:
"""
- cannot find subclasses of child.subclasses
+ looks into direct subclasses of child for name and returns such ab object
+ TODO: cannot find subclasses of child.subclasses
"""
for sub in child.__subclasses__():
if sub.name == name:
- print(sub.name, "CALLING NEW")
return sub(config)
raise ClassFoundException(f"Unable to initialize STEP with {name=}")
From 96845305a3ff265fe2690c91ab577fef659a7992 Mon Sep 17 00:00:00 2001
From: msramalho <19508417+msramalho@users.noreply.github.com>
Date: Wed, 14 Dec 2022 19:01:20 +0000
Subject: [PATCH 028/190] media concept implemented
---
src/archivers/telethon_archiverv2.py | 10 ++--
src/media.py | 17 +++++++
src/metadata.py | 63 +++++++++++---------------
src/orchestrator.py | 7 +--
src/storages/__init__.py | 4 +-
src/storages/s3.py | 68 ++++++++++++++++++++++++++++
src/storages/storage.py | 21 +++++++++
7 files changed, 145 insertions(+), 45 deletions(-)
create mode 100644 src/media.py
create mode 100644 src/storages/s3.py
create mode 100644 src/storages/storage.py
diff --git a/src/archivers/telethon_archiverv2.py b/src/archivers/telethon_archiverv2.py
index 4fa3ce0..ea19c92 100644
--- a/src/archivers/telethon_archiverv2.py
+++ b/src/archivers/telethon_archiverv2.py
@@ -9,6 +9,8 @@ from loguru import logger
from tqdm import tqdm
import re, time, json, os
+from media import Media
+
class TelethonArchiver(Archiverv2):
name = "telethon"
@@ -131,17 +133,17 @@ class TelethonArchiver(Archiverv2):
other_media_urls = [e.url for e in mp.entities if hasattr(e, "url") and e.url and self._guess_file_type(e.url) in ["video", "image", "audio"]]
if len(other_media_urls):
logger.debug(f"Got {len(other_media_urls)} other medial urls from {mp.id=}: {other_media_urls}")
- for om_url in other_media_urls:
- filename = os.path.join(tmp_dir, f'{chat}_{group_id}_{self._get_key_from_url(om_url)}')
+ for i, om_url in enumerate(other_media_urls):
+ filename = os.path.join(tmp_dir, f'{chat}_{group_id}_{i}')
self.download_from_url(om_url, filename)
- result.add_media(filename)
+ result.add_media(Media(filename))
filename_dest = os.path.join(tmp_dir, f'{chat}_{group_id}', str(mp.id))
filename = self.client.download_media(mp.media, filename_dest)
if not filename:
logger.debug(f"Empty media found, skipping {str(mp)=}")
continue
- result.add_media(filename)
+ result.add_media(Media(filename))
result.set("post", str(post)).set_title(title).set_timestamp(post.date)
return result
diff --git a/src/media.py b/src/media.py
new file mode 100644
index 0000000..ecee4f4
--- /dev/null
+++ b/src/media.py
@@ -0,0 +1,17 @@
+
+from __future__ import annotations
+from ast import List
+from typing import Any, Union, Dict
+from dataclasses import dataclass
+from datetime import datetime
+import json
+
+
+@dataclass
+class Media:
+ filename: str
+ id: str = None
+ hash: str = None
+ cdn_url: str = None
+ hash: str = None
+
diff --git a/src/metadata.py b/src/metadata.py
index 193003f..8945e1a 100644
--- a/src/metadata.py
+++ b/src/metadata.py
@@ -2,49 +2,38 @@
from __future__ import annotations
from ast import List
from typing import Any, Union, Dict
-from dataclasses import dataclass
+from dataclasses import dataclass, field
from datetime import datetime
-import json
+# import json
+
+from media import Media
@dataclass
class Metadata:
- # does not handle files, only primitives
- # the only piece of logic to handle files is the archiver, enricher, and storage
- status: str
- # title: str
- # url: str
- # hash: str
- metadata: Dict[str, Any]
-
- # TODO: remove and use default?
- def __init__(self, status="") -> None:
- self.status = status
- self.metadata = {}
+ status: str = ""
+ metadata: Dict[str, Any] = field(default_factory=dict)
+ media: List[Media] = field(default_factory=list)
def merge(self: Metadata, right: Metadata, overwrite_left=True) -> Metadata:
"""
- merges to Metadata instances, will overwrite according to overwrite_left flag
+ merges two Metadata instances, will overwrite according to overwrite_left flag
"""
- res = Metadata()
if overwrite_left:
- res.status = right.status
- res.metadata = dict(self.metadata) # make a copy
+ self.status = right.status
for k, v in right.metadata.items():
- print(type(v), type(self.get(k)))
- # assert type(v) == type(self.get(k))
- if type(v) not in [dict, list, set] or k not in res.metadata:
- res.set(k, v)
+ assert k not in self.metadata or type(v) == type(self.get(k))
+ if type(v) not in [dict, list, set] or k not in self.metadata:
+ self.set(k, v)
else: # key conflict
- if type(v) in [dict, set]: res.set(k, self.get(k) | v)
- elif type(v) == list: res.set(k, self.get(k) + v)
+ if type(v) in [dict, set]: self.set(k, self.get(k) | v)
+ elif type(v) == list: self.set(k, self.get(k) + v)
+ self.media.extend(right.media)
else: # invert and do same logic
return right.merge(self)
- return res
+ return self
- # TODO: setters?
def set(self, key: str, val: Any) -> Metadata:
- # goes through metadata and returns the Metadata available
self.metadata[key] = val
return self
@@ -65,9 +54,6 @@ class Metadata:
assert type(url) is str and len(url) > 0, "invalid URL"
return url
- def get_media(self) -> List:
- return self.get("media", [], create_if_missing=True)
-
def set_content(self, content: str) -> Metadata:
# the main textual content/information from a social media post, webpage, ...
return self.set("content", content)
@@ -75,14 +61,17 @@ class Metadata:
def set_title(self, title: str) -> Metadata:
return self.set("title", title)
- def set_timestamp(self, title: datetime) -> Metadata:
- return self.set("title", title)
+ def set_timestamp(self, timestamp: datetime) -> Metadata:
+ assert type(timestamp) == datetime, "set_timestamp expects a datetime instance"
+ return self.set("timestamp", timestamp)
- def add_media(self, filename: str) -> Metadata:
+ def add_media(self, media: Media) -> Metadata:
# print(f"adding {filename} to {self.metadata.get('media')}")
# return self.set("media", self.get_media() + [filename])
- return self.get_media().append(filename)
+ # return self.get_media().append(media)
+ return self.media.append(media)
- def as_json(self) -> str:
- # converts all metadata and data into JSON
- return json.dumps(self.metadata)
+ # def as_json(self) -> str:
+ # # converts all metadata and data into JSON
+ # return json.dumps(self.metadata)
+ # #TODO: datetime is not serializable
diff --git a/src/orchestrator.py b/src/orchestrator.py
index 9a523bf..2f33370 100644
--- a/src/orchestrator.py
+++ b/src/orchestrator.py
@@ -153,13 +153,14 @@ class ArchivingOrchestrator:
# these rules are checked in config.py
# assert len(archivers) > 1, "there needs to be at least one Archiver"
- def feed(self) -> list(ArchiveResult):
+ def feed(self) -> list(Metadata):
for url in self.feeder:
print("ARCHIVING", url)
with tempfile.TemporaryDirectory(dir="./") as tmp_dir:
result = self.archive(url, tmp_dir)
+ print(type(result))
print(result)
- print(result.as_json())
+ # print(result.as_json())
print("holding on")
time.sleep(300)
# how does this handle the parameters like folder which can be different for each archiver?
@@ -170,7 +171,7 @@ class ArchivingOrchestrator:
def archive(self, url: str, tmp_dir: str) -> Union[Metadata, None]:
# TODO:
- # url = clear_url(url)
+ # url = clear_url(url) # should we save if they differ?
# result = Metadata(url=url)
result = Metadata()
result.set_url(url)
diff --git a/src/storages/__init__.py b/src/storages/__init__.py
index 99f82b3..96baaba 100644
--- a/src/storages/__init__.py
+++ b/src/storages/__init__.py
@@ -2,4 +2,6 @@
from .base_storage import Storage
from .local_storage import LocalStorage, LocalConfig
from .s3_storage import S3Config, S3Storage
-from .gd_storage import GDConfig, GDStorage
\ No newline at end of file
+from .gd_storage import GDConfig, GDStorage
+
+from .storage import StorageV2
\ No newline at end of file
diff --git a/src/storages/s3.py b/src/storages/s3.py
new file mode 100644
index 0000000..826d66d
--- /dev/null
+++ b/src/storages/s3.py
@@ -0,0 +1,68 @@
+
+from typing import IO
+import boto3, uuid, os, mimetypes
+from botocore.errorfactory import ClientError
+from src.storages import StorageV2
+from loguru import logger
+from slugify import slugify
+
+
+class S3StorageV2(StorageV2):
+ name = "s3_storage"
+
+ def __init__(self, config: dict) -> None:
+ super().__init__(config)
+ self.s3 = boto3.client(
+ 's3',
+ region_name=config.region,
+ endpoint_url=config.endpoint_url.format(region=config.region),
+ aws_access_key_id=config.key,
+ aws_secret_access_key=config.secret
+ )
+
+ @staticmethod
+ def configs() -> dict:
+ return {
+ "bucket": {"default": None, "help": "S3 bucket name"},
+ "region": {"default": None, "help": "S3 region name"},
+ "key": {"default": None, "help": "S3 API key"},
+ "secret": {"default": None, "help": "S3 API secret"},
+ # TODO: how to have sth like a custom folder? has to come from the feeders
+ "endpoint_url": {
+ "default": 'https://{region}.digitaloceanspaces.com',
+ "help": "S3 bucket endpoint, {region} are inserted at runtime"
+ },
+ "cdn_url": {
+ "default": 'https://{bucket}.{region}.cdn.digitaloceanspaces.com/{key}',
+ "help": "S3 CDN url, {bucket}, {region} and {key} are inserted at runtime"
+ },
+ "private": {"default": False, "help": "if true S3 files will not be readable online"},
+ "key_path": {"default": "random", "help": "S3 file names are non-predictable strings, one of ['random', 'default']"},
+ }
+
+ def get_cdn_url(self, key: str) -> str:
+ return self.cdn_url.format(bucket=self.bucket, region=self.region, key=self._get_path(key))
+
+ def uploadf(self, file: IO[bytes], key: str, **kwargs: dict) -> None:
+ extra_args = kwargs.get("extra_args", {})
+ if not self.private and 'ACL' not in extra_args:
+ extra_args['ACL'] = 'public-read'
+
+ if 'ContentType' not in extra_args:
+ try:
+ extra_args['ContentType'] = mimetypes.guess_type(key)[0]
+ except Exception as e:
+ logger.error(f"Unable to get mimetype for {key=}, error: {e}")
+
+ self.s3.upload_fileobj(file, Bucket=self.bucket, Key=self._get_path(key), ExtraArgs=extra_args)
+
+ def exists(self, key: str) -> bool:
+ """
+ Tests if a given file with key=key exists in the bucket
+ """
+ try:
+ self.s3.head_object(Bucket=self.bucket, Key=self._get_path(key))
+ return True
+ except ClientError as e:
+ logger.warning(f"got a ClientError when checking if {key=} exists in bucket={self.bucket}: {e}")
+ return False
diff --git a/src/storages/storage.py b/src/storages/storage.py
new file mode 100644
index 0000000..4052d7e
--- /dev/null
+++ b/src/storages/storage.py
@@ -0,0 +1,21 @@
+from __future__ import annotations
+from abc import abstractmethod
+from dataclasses import dataclass
+from metadata import Metadata
+from steps.step import Step
+
+
+@dataclass
+class StorageV2(Step):
+ name = "storage"
+
+ def __init__(self, config: dict) -> None:
+ # without this STEP.__init__ is not called
+ super().__init__(config)
+
+ # only for typing...
+ def init(name: str, config: dict) -> StorageV2:
+ return Step.init(name, config, StorageV2)
+
+ @abstractmethod
+ def store(self, item: Metadata) -> Metadata: pass
From bb512b36c9c54787c4c5ddd81b6743ae6d03e927 Mon Sep 17 00:00:00 2001
From: msramalho <19508417+msramalho@users.noreply.github.com>
Date: Wed, 4 Jan 2023 16:37:36 +0000
Subject: [PATCH 029/190] gsheet feeder + db WIP
---
orchestration.example.yaml | 4 +-
src/archivers/archiver.py | 4 +
src/configs/v2config.py | 11 ++-
src/databases/__init__.py | 2 +
src/databases/database.py | 27 +++++-
src/databases/gsheet_db.py | 64 ++++++++++++++
src/enrichers/__init__.py | 2 +-
...r_screenshot.py => screenshot_enricher.py} | 0
src/feeders/__init__.py | 2 +-
src/feeders/feeder.py | 6 +-
.../{feeder_gsheet.py => gsheet_feeder.py} | 26 +++---
src/metadata.py | 24 +++++-
src/orchestrator.py | 83 +++++++++++--------
src/steps/step.py | 2 +-
src/storages/__init__.py | 3 +-
15 files changed, 195 insertions(+), 65 deletions(-)
create mode 100644 src/databases/__init__.py
create mode 100644 src/databases/gsheet_db.py
rename src/enrichers/{enricher_screenshot.py => screenshot_enricher.py} (100%)
rename src/feeders/{feeder_gsheet.py => gsheet_feeder.py} (85%)
diff --git a/orchestration.example.yaml b/orchestration.example.yaml
index 7163829..caf7737 100644
--- a/orchestration.example.yaml
+++ b/orchestration.example.yaml
@@ -1,7 +1,7 @@
steps:
# only 1 feeder allowed
# a feeder could be in an "infinite loop" for example: gsheets_infinite feeder which holds-> this could be an easy logic addiction by modifying for each to while not feeder.done() if it becomes necessary
- feeder: gsheets_feeder # default -> only expects URL from CLI
+ feeder: gsheet_feeder # default -> only expects URL from CLI
archivers: # order matters
- telethon
# - tiktok
@@ -28,7 +28,7 @@ steps:
configurations:
global:
- save_logs: False
- gsheets_feeder:
+ gsheet_feeder:
sheet: my-auto-archiver
header: 2 # defaults to 1 in GSheetsFeeder
service_account: "secrets/service_account.json"
diff --git a/src/archivers/archiver.py b/src/archivers/archiver.py
index 37f5d4d..f16464a 100644
--- a/src/archivers/archiver.py
+++ b/src/archivers/archiver.py
@@ -23,6 +23,10 @@ class Archiverv2(Step):
# used when archivers need to login or do other one-time setup
pass
+ def clean_url(self, url:str) -> str:
+ # used to clean unnecessary URL parameters
+ return url
+
def _guess_file_type(self, path: str) -> str:
"""
Receives a URL or filename and returns global mimetype like 'image' or 'video'
diff --git a/src/configs/v2config.py b/src/configs/v2config.py
index 75a125e..b028b5e 100644
--- a/src/configs/v2config.py
+++ b/src/configs/v2config.py
@@ -5,6 +5,8 @@ from dataclasses import dataclass, field
from typing import List
from archivers import Archiverv2
from feeders import Feeder
+from databases import Database
+from storages import StorageV2
from steps.step import Step
from enrichers import Enricher
from collections import defaultdict
@@ -13,10 +15,13 @@ from collections import defaultdict
@dataclass
class ConfigV2:
# TODO: should Config inherit from Step so it can have it's own configurations?
+ # these are only detected if they are put to the respective __init__.py
configurable_parents = [
Feeder,
Enricher,
Archiverv2,
+ Database,
+ StorageV2
# Util
]
feeder: Step # TODO:= BaseFeeder
@@ -24,14 +29,14 @@ class ConfigV2:
enrichers: List[Enricher] = field(default_factory=[])
formatters: List[Step] = field(default_factory=[]) # TODO: fix type
storages: List[Step] = field(default_factory=[]) # TODO: fix type
- databases: List[Step] = field(default_factory=[]) # TODO: fix type
+ databases: List[Database] = field(default_factory=[])
def __init__(self) -> None:
self.defaults = {}
self.cli_ops = {}
self.config = {}
- # TODO: make this work for nested props like gsheets_feeder.columns.url = "URL"
+ # TODO: make this work for nested props like gsheet_feeder.columns.url = "URL"
def parse(self):
# 1. parse CLI values
parser = argparse.ArgumentParser(
@@ -84,10 +89,12 @@ class ConfigV2:
self.feeder = Feeder.init(steps.get("feeder", "cli_feeder"), self.config)
self.enrichers = [Enricher.init(e, self.config) for e in steps.get("enrichers", [])]
self.archivers = [Archiverv2.init(e, self.config) for e in steps.get("archivers", [])]
+ self.databases = [Database.init(e, self.config) for e in steps.get("databases", [])]
print("feeder", self.feeder)
print("enrichers", [e for e in self.enrichers])
print("archivers", [e for e in self.archivers])
+ print("databases", [e for e in self.databases])
def validate(self):
pass
diff --git a/src/databases/__init__.py b/src/databases/__init__.py
new file mode 100644
index 0000000..17b9c6d
--- /dev/null
+++ b/src/databases/__init__.py
@@ -0,0 +1,2 @@
+from .database import Database
+from .gsheet_db import GsheetsDb
\ No newline at end of file
diff --git a/src/databases/database.py b/src/databases/database.py
index 15f8d0d..94b2178 100644
--- a/src/databases/database.py
+++ b/src/databases/database.py
@@ -1,9 +1,11 @@
from __future__ import annotations
from dataclasses import dataclass
from abc import abstractmethod, ABC
+from typing import Union
from metadata import Metadata
from steps.step import Step
+
@dataclass
class Database(Step, ABC):
name = "database"
@@ -11,11 +13,30 @@ class Database(Step, ABC):
def __init__(self, config: dict) -> None:
# without this STEP.__init__ is not called
super().__init__(config)
-
- # only for typing...
def init(name: str, config: dict) -> Database:
+ # only for typing...
return Step.init(name, config, Database)
@abstractmethod
- def enrich(self, item: Metadata) -> Metadata: pass
+ def started(self, item: Metadata) -> None:
+ """signals the DB that the given item archival has started"""
+ pass
+
+ def failed(self, item: Metadata) -> None:
+ """update DB accordingly for failure"""
+ pass
+
+ def aborted(self, item: Metadata) -> None:
+ """abort notification if user cancelled after start"""
+ pass
+
+ # @abstractmethod
+ def fetch(self, item: Metadata) -> Union[Metadata, bool]:
+ """check if the given item has been archived already"""
+ return False
+
+ @abstractmethod
+ def done(self, item: Metadata) -> None:
+ """archival result ready - should be saved to DB"""
+ pass
diff --git a/src/databases/gsheet_db.py b/src/databases/gsheet_db.py
new file mode 100644
index 0000000..939e851
--- /dev/null
+++ b/src/databases/gsheet_db.py
@@ -0,0 +1,64 @@
+from typing import Union, Tuple
+import gspread
+
+# from metadata import Metadata
+from loguru import logger
+
+# from . import Enricher
+from databases import Database
+from metadata import Metadata
+from steps.gsheet import Gsheets
+from utils import GWorksheet
+
+
+class GsheetsDb(Database):
+ """
+ NB: only works if GsheetFeeder is used.
+ could be updated in the future to support non-GsheetFeeder metadata
+ """
+ name = "gsheet_db"
+
+ def __init__(self, config: dict) -> None:
+ # without this STEP.__init__ is not called
+ super().__init__(config)
+
+ @staticmethod
+ def configs() -> dict:
+ return {}
+
+ def started(self, item: Metadata) -> None:
+ logger.warning(f"STARTED {item}")
+ gw, row = self._retrieve_gsheet(item)
+ gw.set_cell(row, 'status', 'Archive in progress')
+
+ def failed(self, item: Metadata) -> None:
+ logger.error(f"FAILED {item}")
+ self._safe_status_update(item, 'Archive failed')
+
+ def aborted(self, item: Metadata) -> None:
+ logger.warning(f"ABORTED {item}")
+ self._safe_status_update(item, '')
+
+ def fetch(self, item: Metadata) -> Union[Metadata, bool]:
+ """check if the given item has been archived already"""
+ # TODO: this should not be done at the feeder stage then!
+ return False
+
+ def done(self, item: Metadata) -> None:
+ """archival result ready - should be saved to DB"""
+ logger.success(f"DONE {item}")
+ gw, row = self._retrieve_gsheet(item)
+ self._safe_status_update(item, 'done')
+ pass
+
+ def _safe_status_update(self, item: Metadata, new_status: str) -> None:
+ try:
+ gw, row = self._retrieve_gsheet(item)
+ gw.set_cell(row, 'status', new_status)
+ except Exception as e:
+ logger.debug(f"Unable to update sheet: {e}")
+
+ def _retrieve_gsheet(self, item: Metadata) -> Tuple[GWorksheet, int]:
+ gw: GWorksheet = item.get("gsheet").get("worksheet")
+ row: int = item.get("gsheet").get("row")
+ return gw, row
diff --git a/src/enrichers/__init__.py b/src/enrichers/__init__.py
index 3c266f8..503ea2c 100644
--- a/src/enrichers/__init__.py
+++ b/src/enrichers/__init__.py
@@ -1,2 +1,2 @@
from .enricher import Enricher
-from .enricher_screenshot import ScreenshotEnricher
\ No newline at end of file
+from .screenshot_enricher import ScreenshotEnricher
\ No newline at end of file
diff --git a/src/enrichers/enricher_screenshot.py b/src/enrichers/screenshot_enricher.py
similarity index 100%
rename from src/enrichers/enricher_screenshot.py
rename to src/enrichers/screenshot_enricher.py
diff --git a/src/feeders/__init__.py b/src/feeders/__init__.py
index 9fb5942..b11cd50 100644
--- a/src/feeders/__init__.py
+++ b/src/feeders/__init__.py
@@ -1,2 +1,2 @@
from.feeder import Feeder
-from .feeder_gsheet import GsheetsFeeder
\ No newline at end of file
+from .gsheet_feeder import GsheetsFeeder
\ No newline at end of file
diff --git a/src/feeders/feeder.py b/src/feeders/feeder.py
index d930ba0..bccfab8 100644
--- a/src/feeders/feeder.py
+++ b/src/feeders/feeder.py
@@ -1,7 +1,7 @@
from __future__ import annotations
from dataclasses import dataclass
from abc import abstractmethod
-# from metadata import Metadata
+from metadata import Metadata
from steps.step import Step
@@ -17,7 +17,5 @@ class Feeder(Step):
# only for code typing
return Step.init(name, config, Feeder)
- # def feed(self, item: Metadata) -> Metadata: pass
-
@abstractmethod
- def __iter__(self) -> Feeder: return None
\ No newline at end of file
+ def __iter__(self) -> Metadata: return None
\ No newline at end of file
diff --git a/src/feeders/feeder_gsheet.py b/src/feeders/gsheet_feeder.py
similarity index 85%
rename from src/feeders/feeder_gsheet.py
rename to src/feeders/gsheet_feeder.py
index ad28af1..b9389a2 100644
--- a/src/feeders/feeder_gsheet.py
+++ b/src/feeders/gsheet_feeder.py
@@ -1,16 +1,17 @@
-import json, gspread
+import gspread
# from metadata import Metadata
from loguru import logger
# from . import Enricher
from feeders import Feeder
+from metadata import Metadata
from steps.gsheet import Gsheets
from utils import GWorksheet
class GsheetsFeeder(Gsheets, Feeder):
- name = "gsheets_feeder"
+ name = "gsheet_feeder"
def __init__(self, config: dict) -> None:
# without this STEP.__init__ is not called
@@ -35,7 +36,7 @@ class GsheetsFeeder(Gsheets, Feeder):
}
})
- def __iter__(self) -> str:
+ def __iter__(self) -> Metadata:
sh = self.gsheets_client.open(self.sheet)
for ii, wks in enumerate(sh.worksheets()):
if not self.should_process_sheet(wks.title):
@@ -52,17 +53,16 @@ class GsheetsFeeder(Gsheets, Feeder):
for row in range(1 + self.header, gw.count_rows() + 1):
url = gw.get_cell(row, 'url').strip()
if not len(url): continue
- # TODO: gsheet_db should check later if this is supposed to be archived
- # static_status = gw.get_cell(row, 'status')
- # status = gw.get_cell(row, 'status', fresh=static_status in ['', None] and url != '')
- # All checks done - archival process starts here
- yield url
- logger.success(f'Finished worksheet {wks.title}')
- # GWorksheet(self.sheet)
- print(self.sheet)
- for u in ["url1", "url2"]:
- yield u
+ original_status = gw.get_cell(row, 'status')
+ status = gw.get_cell(row, 'status', fresh=original_status in ['', None])
+ # TODO: custom status parser(?) aka should_retry_from_status
+ if status not in ['', None]: continue
+
+ # All checks done - archival process starts here
+ yield Metadata().set_url(url).set("gsheet", {"row": row, "worksheet": gw}, True)
+
+ logger.success(f'Finished worksheet {wks.title}')
def should_process_sheet(self, sheet_name: str) -> bool:
if len(self.allow_worksheets) and sheet_name not in self.allow_worksheets:
diff --git a/src/metadata.py b/src/metadata.py
index 8945e1a..90ca743 100644
--- a/src/metadata.py
+++ b/src/metadata.py
@@ -1,6 +1,6 @@
from __future__ import annotations
-from ast import List
+from ast import List, Set
from typing import Any, Union, Dict
from dataclasses import dataclass, field
from datetime import datetime
@@ -12,8 +12,14 @@ from media import Media
@dataclass
class Metadata:
status: str = ""
- metadata: Dict[str, Any] = field(default_factory=dict)
+ metadata: Dict[str, Any] = field(default_factory=dict)
+ tmp_keys: Set[str] = field(default_factory=set) # keys that are not to be saved in DBs
media: List[Media] = field(default_factory=list)
+ rearchivable: bool = False
+
+ # def __init__(self, url, metadata = {}) -> None:
+ # self.set_url(url)
+ # self.metadata = metadata
def merge(self: Metadata, right: Metadata, overwrite_left=True) -> Metadata:
"""
@@ -21,6 +27,7 @@ class Metadata:
"""
if overwrite_left:
self.status = right.status
+ self.rearchivable |= right.rearchivable
for k, v in right.metadata.items():
assert k not in self.metadata or type(v) == type(self.get(k))
if type(v) not in [dict, list, set] or k not in self.metadata:
@@ -33,8 +40,10 @@ class Metadata:
return right.merge(self)
return self
- def set(self, key: str, val: Any) -> Metadata:
+ def set(self, key: str, val: Any, is_tmp=False) -> Metadata:
+ # if not self.metadata: self.metadata = {}
self.metadata[key] = val
+ if is_tmp: self.tmp_keys.add(key)
return self
def get(self, key: str, default: Any = None, create_if_missing=False) -> Union[Metadata, str]:
@@ -75,3 +84,12 @@ class Metadata:
# # converts all metadata and data into JSON
# return json.dumps(self.metadata)
# #TODO: datetime is not serializable
+
+ def cleanup(self) -> Metadata:
+ #TODO: refactor so it returns a JSON with all intended properties, except tmp_keys
+ # the code below leads to errors if database needs tmp_keys after they are removed
+ # """removes temporary metadata fields, ideally called after all ops except writing"""
+ # for tmp_key in self.tmp_keys:
+ # self.metadata.pop(tmp_key, None)
+ # self.tmp_keys = set()
+ pass
diff --git a/src/orchestrator.py b/src/orchestrator.py
index 2f33370..26baed1 100644
--- a/src/orchestrator.py
+++ b/src/orchestrator.py
@@ -5,8 +5,11 @@ from dataclasses import dataclass
from archivers.archiver import Archiverv2
from enrichers.enricher import Enricher
+from databases.database import Database
from metadata import Metadata
-import tempfile, time
+import tempfile, time, traceback
+from loguru import logger
+
"""
how not to couple the different pieces of logic
@@ -119,7 +122,7 @@ class ArchivingOrchestrator:
# identify each formatter, storage, database, etc
# self.feeder = Feeder.init(config.feeder, config.get(config.feeder))
- # Is it possible to overwrite config.yaml values? it could be useful: share config file and modify gsheets_feeder.sheet via CLI
+ # Is it possible to overwrite config.yaml values? it could be useful: share config file and modify gsheet_feeder.sheet via CLI
# where does that update/processing happen? in config.py
# reflection for Archiver to know wihch child classes it has? use Archiver.__subclasses__
# self.archivers = [
@@ -129,12 +132,12 @@ class ArchivingOrchestrator:
self.feeder = config.feeder
self.enrichers = config.enrichers
self.archivers: List[Archiverv2] = config.archivers
+ self.databases: List[Database] = config.databases
for a in self.archivers: a.setup()
self.formatters = []
self.storages = []
- self.databases = []
# self.formatters = [
# Formatter.init(f, config)
# for f in config.formatters
@@ -154,51 +157,61 @@ class ArchivingOrchestrator:
# assert len(archivers) > 1, "there needs to be at least one Archiver"
def feed(self) -> list(Metadata):
- for url in self.feeder:
- print("ARCHIVING", url)
- with tempfile.TemporaryDirectory(dir="./") as tmp_dir:
- result = self.archive(url, tmp_dir)
- print(type(result))
- print(result)
- # print(result.as_json())
- print("holding on")
- time.sleep(300)
+ for item in self.feeder:
+ print("ARCHIVING", item)
+ try:
+ with tempfile.TemporaryDirectory(dir="./") as tmp_dir:
+ item.set("tmp_dir", tmp_dir, True)
+ result = self.archive(item)
+ print(result)
+ except KeyboardInterrupt:
+ # catches keyboard interruptions to do a clean exit
+ logger.warning(f"caught interrupt on {item=}")
+ for d in self.databases: d.aborted(item)
+ exit()
+ except Exception as e:
+ logger.error(f'Got unexpected error on item {item}: {e}\n{traceback.format_exc()}')
+ for d in self.databases: d.failed(item)
+
+ print("holding on 5min")
+ time.sleep(300)
+
# how does this handle the parameters like folder which can be different for each archiver?
# the storage needs to know where to archive!!
# solution: feeders have context: extra metadata that they can read or ignore,
# all of it should have sensible defaults (eg: folder)
# default feeder is a list with 1 element
- def archive(self, url: str, tmp_dir: str) -> Union[Metadata, None]:
- # TODO:
- # url = clear_url(url) # should we save if they differ?
- # result = Metadata(url=url)
- result = Metadata()
+ def archive(self, result: Metadata) -> Union[Metadata, None]:
+ url = result.get_url()
+ # TODO: clean urls
+ for a in self.archivers:
+ url = a.clean_url(url)
result.set_url(url)
- result.set("tmp_dir", tmp_dir)
-
- should_archive = True
- for d in self.databases: should_archive &= d.should_process(url)
+ # should_archive = False
+ # for d in self.databases: should_archive |= d.should_process(url)
# should storages also be able to check?
- for s in self.storages: should_archive &= s.should_process(url)
+ # for s in self.storages: should_archive |= s.should_process(url)
- if not should_archive:
- print("skipping")
- return "skipping"
+ # if not should_archive:
+ # print("skipping")
+ # return "skipping"
# signal to DB that archiving has started
+ # and propagate already archived if it exists
+ cached_result = None
for d in self.databases:
# are the databases to decide whether to archive?
# they can simply return True by default, otherwise they can avoid duplicates. should this logic be more granular, for example on the archiver level: a tweet will not need be scraped twice, whereas an instagram profile might. the archiver could not decide from the link which parts to archive,
# instagram profile example: it would always re-archive everything
# maybe the database/storage could use a hash/key to decide if there's a need to re-archive
- if d.should_process(url):
- d.started(url)
- elif d.exists(url):
- return d.fetch(url)
- else:
- print("Skipping url")
- return
+ d.started(result)
+ if (local_result := d.fetch(result)):
+ cached_result = (cached_result or Metadata()).merge(local_result)
+ if cached_result and not cached_result.rearchivable:
+ for d in self.databases:
+ d.done(cached_result)
+ return cached_result
# vk, telethon, ...
for a in self.archivers:
@@ -209,6 +222,7 @@ class ArchivingOrchestrator:
# this is where the Hashes come from, the place with access to all content
# the archiver does not have access to storage
result.merge(a.download(result))
+ # TODO: fix logic
if True or result.is_success(): break
# what if an archiver returns multiple entries and one is to be part of HTMLgenerator?
@@ -224,13 +238,14 @@ class ArchivingOrchestrator:
for f in self.formatters:
result.merge(f.format(result))
- # storages
+ # storage
for s in self.storages:
for m in result.media:
- m.merge(s.store(m))
+ result.merge(s.store(m))
# signal completion to databases (DBs, Google Sheets, CSV, ...)
# a hash registration service could be one database: forensic archiving
+ result.cleanup()
for d in self.databases: d.done(result)
return result
diff --git a/src/steps/step.py b/src/steps/step.py
index 7a2135c..b512af7 100644
--- a/src/steps/step.py
+++ b/src/steps/step.py
@@ -14,7 +14,7 @@ class Step(ABC):
def __init__(self, config: dict) -> None:
# reads the configs into object properties
# self.config = config[self.name]
- for k, v in config[self.name].items():
+ for k, v in config.get(self.name, {}).items():
self.__setattr__(k, v)
@staticmethod
diff --git a/src/storages/__init__.py b/src/storages/__init__.py
index 96baaba..91ce148 100644
--- a/src/storages/__init__.py
+++ b/src/storages/__init__.py
@@ -4,4 +4,5 @@ from .local_storage import LocalStorage, LocalConfig
from .s3_storage import S3Config, S3Storage
from .gd_storage import GDConfig, GDStorage
-from .storage import StorageV2
\ No newline at end of file
+from .storage import StorageV2
+from .s3 import S3StorageV2
\ No newline at end of file
From 1cdc006b2728d524d6558a8c0215d830632624c7 Mon Sep 17 00:00:00 2001
From: msramalho <19508417+msramalho@users.noreply.github.com>
Date: Wed, 4 Jan 2023 18:02:44 +0000
Subject: [PATCH 030/190] s3 storage + WIP gsheets DB
---
src/configs/v2config.py | 2 ++
src/databases/gsheet_db.py | 36 +++++++++++++++++++++++++++++++++---
src/feeders/gsheet_feeder.py | 6 +++---
src/media.py | 7 +++----
src/metadata.py | 24 +++++++++++++++++++-----
src/orchestrator.py | 17 ++++++++++-------
src/steps/gsheet.py | 1 +
src/storages/s3.py | 30 ++++++++++++++++--------------
src/storages/storage.py | 24 +++++++++++++++++++++++-
9 files changed, 110 insertions(+), 37 deletions(-)
diff --git a/src/configs/v2config.py b/src/configs/v2config.py
index b028b5e..7260d41 100644
--- a/src/configs/v2config.py
+++ b/src/configs/v2config.py
@@ -90,11 +90,13 @@ class ConfigV2:
self.enrichers = [Enricher.init(e, self.config) for e in steps.get("enrichers", [])]
self.archivers = [Archiverv2.init(e, self.config) for e in steps.get("archivers", [])]
self.databases = [Database.init(e, self.config) for e in steps.get("databases", [])]
+ self.storages = [StorageV2.init(e, self.config) for e in steps.get("storages", [])]
print("feeder", self.feeder)
print("enrichers", [e for e in self.enrichers])
print("archivers", [e for e in self.archivers])
print("databases", [e for e in self.databases])
+ print("storages", [e for e in self.storages])
def validate(self):
pass
diff --git a/src/databases/gsheet_db.py b/src/databases/gsheet_db.py
index 939e851..a5e462f 100644
--- a/src/databases/gsheet_db.py
+++ b/src/databases/gsheet_db.py
@@ -1,5 +1,5 @@
from typing import Union, Tuple
-import gspread
+import gspread, datetime
# from metadata import Metadata
from loguru import logger
@@ -7,6 +7,7 @@ from loguru import logger
# from . import Enricher
from databases import Database
from metadata import Metadata
+from media import Media
from steps.gsheet import Gsheets
from utils import GWorksheet
@@ -48,8 +49,37 @@ class GsheetsDb(Database):
"""archival result ready - should be saved to DB"""
logger.success(f"DONE {item}")
gw, row = self._retrieve_gsheet(item)
- self._safe_status_update(item, 'done')
- pass
+ # self._safe_status_update(item, 'done')
+
+ cell_updates = []
+ row_values = gw.get_row(row)
+
+ def batch_if_valid(col, val, final_value=None):
+ final_value = final_value or val
+ if val and gw.col_exists(col) and gw.get_cell(row_values, col) == '':
+ cell_updates.append((row, col, final_value))
+
+ cell_updates.append((row, 'status', item.status))
+
+ media: Media = item.get_single_media()
+
+ batch_if_valid('archive', media.cdn_url)
+ batch_if_valid('date', True, datetime.datetime.utcnow().replace(tzinfo=datetime.timezone.utc).isoformat())
+ batch_if_valid('title', item.get_title())
+ batch_if_valid('text', item.get("content", "")[:500])
+ batch_if_valid('timestamp', item.get_timestamp())
+
+ # TODO: AFTER ENRICHMENTS
+ # batch_if_valid('hash', media.hash)
+ # batch_if_valid('thumbnail', result.thumbnail, f'=IMAGE("{result.thumbnail}")')
+ # batch_if_valid('thumbnail_index', result.thumbnail_index)
+ # batch_if_valid('duration', result.duration, str(result.duration))
+ # batch_if_valid('screenshot', result.screenshot)
+ # if result.wacz is not None:
+ # batch_if_valid('wacz', result.wacz)
+ # batch_if_valid('replaywebpage', f'https://replayweb.page/?source={quote(result.wacz)}#view=pages&url={quote(url)}')
+
+ gw.batch_set_cell(cell_updates)
def _safe_status_update(self, item: Metadata, new_status: str) -> None:
try:
diff --git a/src/feeders/gsheet_feeder.py b/src/feeders/gsheet_feeder.py
index b9389a2..029813f 100644
--- a/src/feeders/gsheet_feeder.py
+++ b/src/feeders/gsheet_feeder.py
@@ -1,4 +1,4 @@
-import gspread
+import gspread, os
# from metadata import Metadata
from loguru import logger
@@ -8,7 +8,7 @@ from feeders import Feeder
from metadata import Metadata
from steps.gsheet import Gsheets
from utils import GWorksheet
-
+from slugify import slugify
class GsheetsFeeder(Gsheets, Feeder):
name = "gsheet_feeder"
@@ -60,7 +60,7 @@ class GsheetsFeeder(Gsheets, Feeder):
if status not in ['', None]: continue
# All checks done - archival process starts here
- yield Metadata().set_url(url).set("gsheet", {"row": row, "worksheet": gw}, True)
+ yield Metadata().set_url(url).set("gsheet", {"row": row, "worksheet": gw}, True).set("folder", os.path.join(slugify(self.sheet), slugify(wks.title)), True)
logger.success(f'Finished worksheet {wks.title}')
diff --git a/src/media.py b/src/media.py
index ecee4f4..c499b5b 100644
--- a/src/media.py
+++ b/src/media.py
@@ -10,8 +10,7 @@ import json
@dataclass
class Media:
filename: str
- id: str = None
- hash: str = None
+ key: str = None
cdn_url: str = None
- hash: str = None
-
+ # id: str = None
+ # hash: str = None # TODO: added by enrichers
diff --git a/src/metadata.py b/src/metadata.py
index 90ca743..f48c636 100644
--- a/src/metadata.py
+++ b/src/metadata.py
@@ -3,7 +3,7 @@ from __future__ import annotations
from ast import List, Set
from typing import Any, Union, Dict
from dataclasses import dataclass, field
-from datetime import datetime
+import datetime
# import json
from media import Media
@@ -70,26 +70,40 @@ class Metadata:
def set_title(self, title: str) -> Metadata:
return self.set("title", title)
- def set_timestamp(self, timestamp: datetime) -> Metadata:
- assert type(timestamp) == datetime, "set_timestamp expects a datetime instance"
+ def get_title(self) -> str:
+ return self.get("title")
+
+ def set_timestamp(self, timestamp: datetime.datetime) -> Metadata:
+ assert type(timestamp) == datetime.datetime, "set_timestamp expects a datetime instance"
return self.set("timestamp", timestamp)
+ def get_timestamp(self, utc=True, iso=True) -> datetime.datetime:
+ ts = self.get("timestamp")
+ if not ts: return ts
+ if utc: ts = ts.replace(tzinfo=datetime.timezone.utc)
+ if iso: return ts.isoformat()
+ return ts
+
def add_media(self, media: Media) -> Metadata:
# print(f"adding {filename} to {self.metadata.get('media')}")
# return self.set("media", self.get_media() + [filename])
# return self.get_media().append(media)
return self.media.append(media)
+ def get_single_media(self) -> Media:
+ # TODO: check if formatters were applied and choose with priority
+ return self.media[0]
+
# def as_json(self) -> str:
# # converts all metadata and data into JSON
# return json.dumps(self.metadata)
# #TODO: datetime is not serializable
def cleanup(self) -> Metadata:
- #TODO: refactor so it returns a JSON with all intended properties, except tmp_keys
+ # TODO: refactor so it returns a JSON with all intended properties, except tmp_keys
# the code below leads to errors if database needs tmp_keys after they are removed
# """removes temporary metadata fields, ideally called after all ops except writing"""
# for tmp_key in self.tmp_keys:
- # self.metadata.pop(tmp_key, None)
+ # self.metadata.pop(tmp_key, None)
# self.tmp_keys = set()
pass
diff --git a/src/orchestrator.py b/src/orchestrator.py
index 26baed1..3bc5ea7 100644
--- a/src/orchestrator.py
+++ b/src/orchestrator.py
@@ -2,15 +2,18 @@ from __future__ import annotations
from ast import List
from typing import Union, Dict
from dataclasses import dataclass
-from archivers.archiver import Archiverv2
-from enrichers.enricher import Enricher
-from databases.database import Database
+from archivers import Archiverv2
+from storages import StorageV2
+from enrichers import Enricher
+from databases import Database
from metadata import Metadata
+
import tempfile, time, traceback
from loguru import logger
+
"""
how not to couple the different pieces of logic
due to the use of constants for the metadata keys?
@@ -133,11 +136,11 @@ class ArchivingOrchestrator:
self.enrichers = config.enrichers
self.archivers: List[Archiverv2] = config.archivers
self.databases: List[Database] = config.databases
+ self.storages: List[StorageV2] = config.storages
for a in self.archivers: a.setup()
self.formatters = []
- self.storages = []
# self.formatters = [
# Formatter.init(f, config)
# for f in config.formatters
@@ -184,7 +187,7 @@ class ArchivingOrchestrator:
def archive(self, result: Metadata) -> Union[Metadata, None]:
url = result.get_url()
- # TODO: clean urls
+ # TODO: clean urls
for a in self.archivers:
url = a.clean_url(url)
result.set_url(url)
@@ -240,8 +243,8 @@ class ArchivingOrchestrator:
# storage
for s in self.storages:
- for m in result.media:
- result.merge(s.store(m))
+ for i, m in enumerate(result.media):
+ result.media[i] = s.store(m, result)
# signal completion to databases (DBs, Google Sheets, CSV, ...)
# a hash registration service could be one database: forensic archiving
diff --git a/src/steps/gsheet.py b/src/steps/gsheet.py
index 279c036..6bfb5d7 100644
--- a/src/steps/gsheet.py
+++ b/src/steps/gsheet.py
@@ -30,6 +30,7 @@ class Gsheets(Step):
'thumbnail_index': 'thumbnail index',
'timestamp': 'upload timestamp',
'title': 'upload title',
+ 'text': 'text content',
'duration': 'duration',
'screenshot': 'screenshot',
'hash': 'hash',
diff --git a/src/storages/s3.py b/src/storages/s3.py
index 826d66d..d4457e8 100644
--- a/src/storages/s3.py
+++ b/src/storages/s3.py
@@ -1,8 +1,10 @@
-from typing import IO
+from typing import IO, Any
import boto3, uuid, os, mimetypes
from botocore.errorfactory import ClientError
-from src.storages import StorageV2
+from metadata import Metadata
+from media import Media
+from storages import StorageV2
from loguru import logger
from slugify import slugify
@@ -14,10 +16,10 @@ class S3StorageV2(StorageV2):
super().__init__(config)
self.s3 = boto3.client(
's3',
- region_name=config.region,
- endpoint_url=config.endpoint_url.format(region=config.region),
- aws_access_key_id=config.key,
- aws_secret_access_key=config.secret
+ region_name=self.region,
+ endpoint_url=self.endpoint_url.format(region=self.region),
+ aws_access_key_id=self.key,
+ aws_secret_access_key=self.secret
)
@staticmethod
@@ -37,31 +39,31 @@ class S3StorageV2(StorageV2):
"help": "S3 CDN url, {bucket}, {region} and {key} are inserted at runtime"
},
"private": {"default": False, "help": "if true S3 files will not be readable online"},
- "key_path": {"default": "random", "help": "S3 file names are non-predictable strings, one of ['random', 'default']"},
+ # "key_path": {"default": "random", "help": "S3 file names are non-predictable strings, one of ['random', 'default']"},
}
- def get_cdn_url(self, key: str) -> str:
- return self.cdn_url.format(bucket=self.bucket, region=self.region, key=self._get_path(key))
+ def get_cdn_url(self, media: Media) -> str:
+ return self.cdn_url.format(bucket=self.bucket, region=self.region, key=media.key)
- def uploadf(self, file: IO[bytes], key: str, **kwargs: dict) -> None:
+ def uploadf(self, file: IO[bytes], media: Media, **kwargs: dict) -> Any:
extra_args = kwargs.get("extra_args", {})
if not self.private and 'ACL' not in extra_args:
extra_args['ACL'] = 'public-read'
if 'ContentType' not in extra_args:
try:
- extra_args['ContentType'] = mimetypes.guess_type(key)[0]
+ extra_args['ContentType'] = mimetypes.guess_type(media.key)[0]
except Exception as e:
- logger.error(f"Unable to get mimetype for {key=}, error: {e}")
+ logger.warning(f"Unable to get mimetype for {media.key=}, error: {e}")
- self.s3.upload_fileobj(file, Bucket=self.bucket, Key=self._get_path(key), ExtraArgs=extra_args)
+ self.s3.upload_fileobj(file, Bucket=self.bucket, Key=media.key, ExtraArgs=extra_args)
def exists(self, key: str) -> bool:
"""
Tests if a given file with key=key exists in the bucket
"""
try:
- self.s3.head_object(Bucket=self.bucket, Key=self._get_path(key))
+ self.s3.head_object(Bucket=self.bucket, Key=key)
return True
except ClientError as e:
logger.warning(f"got a ClientError when checking if {key=} exists in bucket={self.bucket}: {e}")
diff --git a/src/storages/storage.py b/src/storages/storage.py
index 4052d7e..06346e9 100644
--- a/src/storages/storage.py
+++ b/src/storages/storage.py
@@ -1,8 +1,12 @@
from __future__ import annotations
from abc import abstractmethod
from dataclasses import dataclass
+from typing import IO, Any
+from media import Media
from metadata import Metadata
from steps.step import Step
+from loguru import logger
+import os, uuid
@dataclass
@@ -17,5 +21,23 @@ class StorageV2(Step):
def init(name: str, config: dict) -> StorageV2:
return Step.init(name, config, StorageV2)
+ def store(self, media: Media, item: Metadata) -> Media:
+ media = self.set_key(media, item)
+ self.upload(media)
+ media.cdn_url = self.get_cdn_url(media)
+ return media
+
@abstractmethod
- def store(self, item: Metadata) -> Metadata: pass
+ def uploadf(self, file: IO[bytes], key: str, **kwargs: dict) -> Any: pass
+
+ def upload(self, media: Media, **kwargs) -> Any:
+ logger.debug(f'[{self.__class__.name}] uploading file {media.filename} with key {media.key}')
+ with open(media.filename, 'rb') as f:
+ return self.uploadf(f, media, **kwargs)
+
+ def set_key(self, media: Media, item: Metadata) -> Media:
+ """takes the media and optionally item info and generates a key"""
+ folder = item.get("folder", "")
+ ext = os.path.splitext(media.filename)[1]
+ media.key = os.path.join(folder, f"{str(uuid.uuid4())}{ext}")
+ return media
From aac16fa8c2946e75e63cc861649bcdb675fbc860 Mon Sep 17 00:00:00 2001
From: msramalho <19508417+msramalho@users.noreply.github.com>
Date: Mon, 9 Jan 2023 22:24:44 +0000
Subject: [PATCH 031/190] minor comments
---
src/databases/gsheet_db.py | 1 +
src/steps/step.py | 2 +-
2 files changed, 2 insertions(+), 1 deletion(-)
diff --git a/src/databases/gsheet_db.py b/src/databases/gsheet_db.py
index a5e462f..ba3785a 100644
--- a/src/databases/gsheet_db.py
+++ b/src/databases/gsheet_db.py
@@ -91,4 +91,5 @@ class GsheetsDb(Database):
def _retrieve_gsheet(self, item: Metadata) -> Tuple[GWorksheet, int]:
gw: GWorksheet = item.get("gsheet").get("worksheet")
row: int = item.get("gsheet").get("row")
+ #TODO: to make gsheet_db less coupled with gsheet_feeder's "gsheet" parameter, this method could 1st try to fetch "gsheet" from item and, if missing, manage its own singleton - not needed for now
return gw, row
diff --git a/src/steps/step.py b/src/steps/step.py
index b512af7..a8bad38 100644
--- a/src/steps/step.py
+++ b/src/steps/step.py
@@ -28,4 +28,4 @@ class Step(ABC):
for sub in child.__subclasses__():
if sub.name == name:
return sub(config)
- raise ClassFoundException(f"Unable to initialize STEP with {name=}")
+ raise ClassFoundException(f"Unable to initialize STEP with {name=}, check your configuration file/step names.")
From d4825196f13e8037cbd9005fa4d630d17f42c0ef Mon Sep 17 00:00:00 2001
From: msramalho <19508417+msramalho@users.noreply.github.com>
Date: Tue, 10 Jan 2023 00:22:16 +0000
Subject: [PATCH 032/190] html template working with jinja templates
---
Pipfile | 2 +
Pipfile.lock | 299 ++++++++++----------
src/configs/v2config.py | 9 +-
src/formatters/__init__.py | 2 +
src/formatters/formatter.py | 21 ++
src/formatters/html_formatter.py | 36 +++
src/formatters/templates/html_template.html | 101 +++++++
src/media.py | 9 +-
src/metadata.py | 26 +-
src/orchestrator.py | 27 +-
src/steps/gsheet.py | 1 +
11 files changed, 369 insertions(+), 164 deletions(-)
create mode 100644 src/formatters/__init__.py
create mode 100644 src/formatters/formatter.py
create mode 100644 src/formatters/html_formatter.py
create mode 100644 src/formatters/templates/html_template.html
diff --git a/Pipfile b/Pipfile
index 2095f2b..d79388d 100644
--- a/Pipfile
+++ b/Pipfile
@@ -27,6 +27,8 @@ vk-url-scraper = "*"
python-twitter-v2 = "*"
instaloader = "*"
tqdm = "*"
+jinja2 = "*"
+cryptography = "==38.0.4"
[requires]
python_version = "3.9"
diff --git a/Pipfile.lock b/Pipfile.lock
index 5bfeba7..83e2607 100644
--- a/Pipfile.lock
+++ b/Pipfile.lock
@@ -1,7 +1,7 @@
{
"_meta": {
"hash": {
- "sha256": "60b8f39d7a466e194c98a3fb6a03f74f03b108f5fac4cce8657c5ffdf6a02962"
+ "sha256": "bcc36e9ecdf6d383a1010629484eec271699ac23b40be045d9a9669b4c9fac8c"
},
"pipfile-spec": 6,
"requires": {
@@ -34,11 +34,11 @@
},
"attrs": {
"hashes": [
- "sha256:29adc2665447e5191d0e7c568fde78b21f9672d344281d0c6e1ab085429b22b6",
- "sha256:86efa402f67bf2df34f51a335487cf46b1ec130d02b8d39fd248abfd30da551c"
+ "sha256:29e95c7f6778868dbd49170f98f8818f78f3dc5e0e37c0b1f474e3561b240836",
+ "sha256:c9227bfc2f01993c03f68db37d1d15c9690188323c067c641f1a35ca58185f99"
],
- "markers": "python_version >= '3.5'",
- "version": "==22.1.0"
+ "markers": "python_version >= '3.6'",
+ "version": "==22.2.0"
},
"authlib": {
"hashes": [
@@ -57,19 +57,19 @@
},
"boto3": {
"hashes": [
- "sha256:53badfc5f145b8a3f9117512b41bc5a64db1cce1b549061d8edba68909e63fdf",
- "sha256:548081a0f8854bb2eea1e368ab29945478105f56989546f653c75528dcb07d88"
+ "sha256:96055651f7be882175aa334ad46528e1ad79fb8ca33fa9c3998cc1d985b34eab",
+ "sha256:e24d65c31780c208768ebcd152d8a0181591c9c8e7d971e23f318d7f41910ba1"
],
"index": "pypi",
- "version": "==1.26.28"
+ "version": "==1.26.46"
},
"botocore": {
"hashes": [
- "sha256:982732e7ed65cb6ed11ea3ce0e32dff2bcd465836c32376154f0802aa0a112c7",
- "sha256:f0b8bb976e368dea20a960b47169e31fc0828feb6f0b9f59f1e5be8d08919b10"
+ "sha256:78bf25933e35eb6354a9e80fe156f86dce4d346a92afe364dfce25c17ab0639f",
+ "sha256:dbac2fde265f13beb9191ec3ff63b90b515e9ed63875edc3afbd72c5f585e48b"
],
"markers": "python_version >= '3.7'",
- "version": "==1.29.28"
+ "version": "==1.29.46"
},
"brotli": {
"hashes": [
@@ -168,11 +168,11 @@
},
"cachetools": {
"hashes": [
- "sha256:6a94c6402995a99c3970cc7e4884bb60b4a8639938157eeed436098bf9831757",
- "sha256:f9f17d2aec496a9aa6b76f53e3b614c965223c061982d434d160f930c698a9db"
+ "sha256:5991bc0e08a1319bb618d3195ca5b6bc76646a49c21d55962977197b301cc1fe",
+ "sha256:8462eebf3a6c15d25430a8c27c56ac61340b2ecf60c9ce57afc2b97e450e47da"
],
"markers": "python_version ~= '3.7'",
- "version": "==5.2.0"
+ "version": "==5.2.1"
},
"certifi": {
"hashes": [
@@ -269,10 +269,10 @@
},
"cloudscraper": {
"hashes": [
- "sha256:5f0cde23774270e8a092de68e0fbd68e17854c767fc2d4042a91bda9e4816871",
- "sha256:ec30da6cee60d0a95e898d9b3aaf09291a0d8b6cf751e86c6f3420b699a00091"
+ "sha256:2776c70f3661c028e59fd306ac2b104882c9b3cb3f798086251e00fc2d72c3a2",
+ "sha256:3b9753724616ac4d811e7922ddc9dba9b4419749ebaa35b0ba503d442522df2e"
],
- "version": "==1.2.66"
+ "version": "==1.2.67"
},
"commonmark": {
"hashes": [
@@ -310,7 +310,7 @@
"sha256:ca57eb3ddaccd1112c18fc80abe41db443cc2e9dcb1917078e02dfa010a4f353",
"sha256:ce127dd0a6a0811c251a6cddd014d292728484e530d80e872ad9806cfb1c5b3c"
],
- "markers": "python_version >= '3.6'",
+ "index": "pypi",
"version": "==38.0.4"
},
"dataclasses-json": {
@@ -323,19 +323,19 @@
},
"dateparser": {
"hashes": [
- "sha256:4431159799b63d8acec5d7d844c5e06edf3d1b0eb2bda6d4cac87134ddddd01c",
- "sha256:73ec6e44a133c54076ecf9f9dc0fbe3dd4831f154f977ff06f53114d57c5425e"
+ "sha256:107f3cc87a60770e10d111349adc1504224a6b60753a47a64b0ec842ab85b5a9",
+ "sha256:ceb159f1b4a9df54ed6209e91298097deafde476037f8611b4cb2b1cb8b31c58"
],
"index": "pypi",
- "version": "==1.1.4"
+ "version": "==1.1.5"
},
"exceptiongroup": {
"hashes": [
- "sha256:542adf9dea4055530d6e1279602fa5cb11dab2395fa650b8674eaec35fc4a828",
- "sha256:bd14967b79cd9bdb54d97323216f8fdf533e278df937aa2a90089e7d6e06e5ec"
+ "sha256:327cbda3da756e2de031a3107b81ab7b3770a602c4d16ca618298c526f4bec1e",
+ "sha256:bcb67d800a4497e1b404c2dd44fca47d3b7a5e5433dbab67f96c1a685cdfdf23"
],
"markers": "python_version < '3.11'",
- "version": "==1.0.4"
+ "version": "==1.1.0"
},
"ffmpeg-python": {
"hashes": [
@@ -347,11 +347,11 @@
},
"filelock": {
"hashes": [
- "sha256:7565f628ea56bfcd8e54e42bdc55da899c85c1abfe1b5bcfd147e9188cebb3b2",
- "sha256:8df285554452285f79c035efb0c861eb33a4bcfa5b7a137016e32e6a90f9792c"
+ "sha256:7b319f24340b51f55a2bf7a12ac0755a9b03e718311dac567a0f4f7fabd2f5de",
+ "sha256:f58d535af89bb9ad5cd4df046f741f8553a418c01a7856bf0d173bbc9f6bd16d"
],
"markers": "python_version >= '3.7'",
- "version": "==3.8.2"
+ "version": "==3.9.0"
},
"flask": {
"hashes": [
@@ -378,19 +378,19 @@
},
"google-api-python-client": {
"hashes": [
- "sha256:03624a28b5ba94f3c3d44761081f5dbf8cabaa20c5c3a96c046457c5713efb9b",
- "sha256:bc2447a7479006d98927fb20faa74d892d3758ff68e99b621367632bc42b8af8"
+ "sha256:9412ad3445518fa9d24d02c673a70b07c9d124990f44763cdf4f5304ca5b4d08",
+ "sha256:a4ea351db2bb2a9b1a7e96d8fa8de0fcbc31d9e237b724f4a07b243c2d63e9a4"
],
"index": "pypi",
- "version": "==2.69.0"
+ "version": "==2.71.0"
},
"google-auth": {
"hashes": [
- "sha256:6897b93556d8d807ad70701bb89f000183aea366ca7ed94680828b37437a4994",
- "sha256:72f12a6cfc968d754d7bdab369c5c5c16032106e52d32c6dfd8484e4c01a6d1f"
+ "sha256:5045648c821fb72384cdc0e82cc326df195f113a33049d9b62b74589243d2acc",
+ "sha256:ed7057a101af1146f0554a769930ac9de506aeca4fd5af6543ebe791851a9fbd"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5'",
- "version": "==2.15.0"
+ "version": "==2.16.0"
},
"google-auth-httplib2": {
"hashes": [
@@ -410,11 +410,11 @@
},
"googleapis-common-protos": {
"hashes": [
- "sha256:27a849d6205838fb6cc3c1c21cb9800707a661bb21c6ce7fb13e99eb1f8a0c46",
- "sha256:a9f4a1d7f6d9809657b7f1316a1aa527f6664891531bcfcc13b6696e685f443c"
+ "sha256:c727251ec025947d545184ba17e3578840fc3a24a0516a020479edab660457df",
+ "sha256:ca3befcd4580dab6ad49356b46bf165bb68ff4b32389f028f1abd7c10ab9519a"
],
"markers": "python_version >= '3.7'",
- "version": "==1.57.0"
+ "version": "==1.58.0"
},
"gspread": {
"hashes": [
@@ -468,7 +468,7 @@
"sha256:31351a702a408a9e7595a8fc6150fc3f43bb6bf7e319770cbc0db9df9437e852",
"sha256:6088930bfe239f0e6710546ab9c19c9ef35e29792895fed6e6e31a023a182a61"
],
- "markers": "python_version >= '3.7'",
+ "index": "pypi",
"version": "==3.1.2"
},
"jmespath": {
@@ -489,79 +489,86 @@
},
"lxml": {
"hashes": [
- "sha256:04da965dfebb5dac2619cb90fcf93efdb35b3c6994fea58a157a834f2f94b318",
- "sha256:0538747a9d7827ce3e16a8fdd201a99e661c7dee3c96c885d8ecba3c35d1032c",
- "sha256:0645e934e940107e2fdbe7c5b6fb8ec6232444260752598bc4d09511bd056c0b",
- "sha256:079b68f197c796e42aa80b1f739f058dcee796dc725cc9a1be0cdb08fc45b000",
- "sha256:0f3f0059891d3254c7b5fb935330d6db38d6519ecd238ca4fce93c234b4a0f73",
- "sha256:10d2017f9150248563bb579cd0d07c61c58da85c922b780060dcc9a3aa9f432d",
- "sha256:1355755b62c28950f9ce123c7a41460ed9743c699905cbe664a5bcc5c9c7c7fb",
- "sha256:13c90064b224e10c14dcdf8086688d3f0e612db53766e7478d7754703295c7c8",
- "sha256:1423631e3d51008871299525b541413c9b6c6423593e89f9c4cfbe8460afc0a2",
- "sha256:1436cf0063bba7888e43f1ba8d58824f085410ea2025befe81150aceb123e345",
- "sha256:1a7c59c6ffd6ef5db362b798f350e24ab2cfa5700d53ac6681918f314a4d3b94",
- "sha256:1e1cf47774373777936c5aabad489fef7b1c087dcd1f426b621fda9dcc12994e",
- "sha256:206a51077773c6c5d2ce1991327cda719063a47adc02bd703c56a662cdb6c58b",
- "sha256:21fb3d24ab430fc538a96e9fbb9b150029914805d551deeac7d7822f64631dfc",
- "sha256:27e590352c76156f50f538dbcebd1925317a0f70540f7dc8c97d2931c595783a",
- "sha256:287605bede6bd36e930577c5925fcea17cb30453d96a7b4c63c14a257118dbb9",
- "sha256:2aaf6a0a6465d39b5ca69688fce82d20088c1838534982996ec46633dc7ad6cc",
- "sha256:32a73c53783becdb7eaf75a2a1525ea8e49379fb7248c3eeefb9412123536387",
- "sha256:41fb58868b816c202e8881fd0f179a4644ce6e7cbbb248ef0283a34b73ec73bb",
- "sha256:4780677767dd52b99f0af1f123bc2c22873d30b474aa0e2fc3fe5e02217687c7",
- "sha256:4878e667ebabe9b65e785ac8da4d48886fe81193a84bbe49f12acff8f7a383a4",
- "sha256:487c8e61d7acc50b8be82bda8c8d21d20e133c3cbf41bd8ad7eb1aaeb3f07c97",
- "sha256:4beea0f31491bc086991b97517b9683e5cfb369205dac0148ef685ac12a20a67",
- "sha256:4cfbe42c686f33944e12f45a27d25a492cc0e43e1dc1da5d6a87cbcaf2e95627",
- "sha256:4d5bae0a37af799207140652a700f21a85946f107a199bcb06720b13a4f1f0b7",
- "sha256:4e285b5f2bf321fc0857b491b5028c5f276ec0c873b985d58d7748ece1d770dd",
- "sha256:57e4d637258703d14171b54203fd6822fda218c6c2658a7d30816b10995f29f3",
- "sha256:5974895115737a74a00b321e339b9c3f45c20275d226398ae79ac008d908bff7",
- "sha256:5ef87fca280fb15342726bd5f980f6faf8b84a5287fcc2d4962ea8af88b35130",
- "sha256:603a464c2e67d8a546ddaa206d98e3246e5db05594b97db844c2f0a1af37cf5b",
- "sha256:6653071f4f9bac46fbc30f3c7838b0e9063ee335908c5d61fb7a4a86c8fd2036",
- "sha256:6ca2264f341dd81e41f3fffecec6e446aa2121e0b8d026fb5130e02de1402785",
- "sha256:6d279033bf614953c3fc4a0aa9ac33a21e8044ca72d4fa8b9273fe75359d5cca",
- "sha256:6d949f53ad4fc7cf02c44d6678e7ff05ec5f5552b235b9e136bd52e9bf730b91",
- "sha256:6daa662aba22ef3258934105be2dd9afa5bb45748f4f702a3b39a5bf53a1f4dc",
- "sha256:6eafc048ea3f1b3c136c71a86db393be36b5b3d9c87b1c25204e7d397cee9536",
- "sha256:830c88747dce8a3e7525defa68afd742b4580df6aa2fdd6f0855481e3994d391",
- "sha256:86e92728ef3fc842c50a5cb1d5ba2bc66db7da08a7af53fb3da79e202d1b2cd3",
- "sha256:8caf4d16b31961e964c62194ea3e26a0e9561cdf72eecb1781458b67ec83423d",
- "sha256:8d1a92d8e90b286d491e5626af53afef2ba04da33e82e30744795c71880eaa21",
- "sha256:8f0a4d179c9a941eb80c3a63cdb495e539e064f8054230844dcf2fcb812b71d3",
- "sha256:9232b09f5efee6a495a99ae6824881940d6447debe272ea400c02e3b68aad85d",
- "sha256:927a9dd016d6033bc12e0bf5dee1dde140235fc8d0d51099353c76081c03dc29",
- "sha256:93e414e3206779ef41e5ff2448067213febf260ba747fc65389a3ddaa3fb8715",
- "sha256:98cafc618614d72b02185ac583c6f7796202062c41d2eeecdf07820bad3295ed",
- "sha256:9c3a88d20e4fe4a2a4a84bf439a5ac9c9aba400b85244c63a1ab7088f85d9d25",
- "sha256:9f36de4cd0c262dd9927886cc2305aa3f2210db437aa4fed3fb4940b8bf4592c",
- "sha256:a60f90bba4c37962cbf210f0188ecca87daafdf60271f4c6948606e4dabf8785",
- "sha256:a614e4afed58c14254e67862456d212c4dcceebab2eaa44d627c2ca04bf86837",
- "sha256:ae06c1e4bc60ee076292e582a7512f304abdf6c70db59b56745cca1684f875a4",
- "sha256:b122a188cd292c4d2fcd78d04f863b789ef43aa129b233d7c9004de08693728b",
- "sha256:b570da8cd0012f4af9fa76a5635cd31f707473e65a5a335b186069d5c7121ff2",
- "sha256:bcaa1c495ce623966d9fc8a187da80082334236a2a1c7e141763ffaf7a405067",
- "sha256:bd34f6d1810d9354dc7e35158aa6cc33456be7706df4420819af6ed966e85448",
- "sha256:be9eb06489bc975c38706902cbc6888f39e946b81383abc2838d186f0e8b6a9d",
- "sha256:c4b2e0559b68455c085fb0f6178e9752c4be3bba104d6e881eb5573b399d1eb2",
- "sha256:c62e8dd9754b7debda0c5ba59d34509c4688f853588d75b53c3791983faa96fc",
- "sha256:c852b1530083a620cb0de5f3cd6826f19862bafeaf77586f1aef326e49d95f0c",
- "sha256:d9fc0bf3ff86c17348dfc5d322f627d78273eba545db865c3cd14b3f19e57fa5",
- "sha256:dad7b164905d3e534883281c050180afcf1e230c3d4a54e8038aa5cfcf312b84",
- "sha256:e5f66bdf0976ec667fc4594d2812a00b07ed14d1b44259d19a41ae3fff99f2b8",
- "sha256:e8f0c9d65da595cfe91713bc1222af9ecabd37971762cb830dea2fc3b3bb2acf",
- "sha256:edffbe3c510d8f4bf8640e02ca019e48a9b72357318383ca60e3330c23aaffc7",
- "sha256:eea5d6443b093e1545ad0210e6cf27f920482bfcf5c77cdc8596aec73523bb7e",
- "sha256:ef72013e20dd5ba86a8ae1aed7f56f31d3374189aa8b433e7b12ad182c0d2dfb",
- "sha256:f05251bbc2145349b8d0b77c0d4e5f3b228418807b1ee27cefb11f69ed3d233b",
- "sha256:f1be258c4d3dc609e654a1dc59d37b17d7fef05df912c01fc2e15eb43a9735f3",
- "sha256:f9ced82717c7ec65a67667bb05865ffe38af0e835cdd78728f1209c8fffe0cad",
- "sha256:fe17d10b97fdf58155f858606bddb4e037b805a60ae023c009f760d8361a4eb8",
- "sha256:fe749b052bb7233fe5d072fcb549221a8cb1a16725c47c37e42b0b9cb3ff2c3f"
+ "sha256:01d36c05f4afb8f7c20fd9ed5badca32a2029b93b1750f571ccc0b142531caf7",
+ "sha256:04876580c050a8c5341d706dd464ff04fd597095cc8c023252566a8826505726",
+ "sha256:05ca3f6abf5cf78fe053da9b1166e062ade3fa5d4f92b4ed688127ea7d7b1d03",
+ "sha256:090c6543d3696cbe15b4ac6e175e576bcc3f1ccfbba970061b7300b0c15a2140",
+ "sha256:0dc313ef231edf866912e9d8f5a042ddab56c752619e92dfd3a2c277e6a7299a",
+ "sha256:0f2b1e0d79180f344ff9f321327b005ca043a50ece8713de61d1cb383fb8ac05",
+ "sha256:13598ecfbd2e86ea7ae45ec28a2a54fb87ee9b9fdb0f6d343297d8e548392c03",
+ "sha256:16efd54337136e8cd72fb9485c368d91d77a47ee2d42b057564aae201257d419",
+ "sha256:1ab8f1f932e8f82355e75dda5413a57612c6ea448069d4fb2e217e9a4bed13d4",
+ "sha256:223f4232855ade399bd409331e6ca70fb5578efef22cf4069a6090acc0f53c0e",
+ "sha256:2455cfaeb7ac70338b3257f41e21f0724f4b5b0c0e7702da67ee6c3640835b67",
+ "sha256:2899456259589aa38bfb018c364d6ae7b53c5c22d8e27d0ec7609c2a1ff78b50",
+ "sha256:2a29ba94d065945944016b6b74e538bdb1751a1db6ffb80c9d3c2e40d6fa9894",
+ "sha256:2a87fa548561d2f4643c99cd13131acb607ddabb70682dcf1dff5f71f781a4bf",
+ "sha256:2e430cd2824f05f2d4f687701144556646bae8f249fd60aa1e4c768ba7018947",
+ "sha256:36c3c175d34652a35475a73762b545f4527aec044910a651d2bf50de9c3352b1",
+ "sha256:3818b8e2c4b5148567e1b09ce739006acfaa44ce3156f8cbbc11062994b8e8dd",
+ "sha256:3ab9fa9d6dc2a7f29d7affdf3edebf6ece6fb28a6d80b14c3b2fb9d39b9322c3",
+ "sha256:3efea981d956a6f7173b4659849f55081867cf897e719f57383698af6f618a92",
+ "sha256:4c8f293f14abc8fd3e8e01c5bd86e6ed0b6ef71936ded5bf10fe7a5efefbaca3",
+ "sha256:5344a43228767f53a9df6e5b253f8cdca7dfc7b7aeae52551958192f56d98457",
+ "sha256:58bfa3aa19ca4c0f28c5dde0ff56c520fbac6f0daf4fac66ed4c8d2fb7f22e74",
+ "sha256:5b4545b8a40478183ac06c073e81a5ce4cf01bf1734962577cf2bb569a5b3bbf",
+ "sha256:5f50a1c177e2fa3ee0667a5ab79fdc6b23086bc8b589d90b93b4bd17eb0e64d1",
+ "sha256:63da2ccc0857c311d764e7d3d90f429c252e83b52d1f8f1d1fe55be26827d1f4",
+ "sha256:6749649eecd6a9871cae297bffa4ee76f90b4504a2a2ab528d9ebe912b101975",
+ "sha256:6804daeb7ef69e7b36f76caddb85cccd63d0c56dedb47555d2fc969e2af6a1a5",
+ "sha256:689bb688a1db722485e4610a503e3e9210dcc20c520b45ac8f7533c837be76fe",
+ "sha256:699a9af7dffaf67deeae27b2112aa06b41c370d5e7633e0ee0aea2e0b6c211f7",
+ "sha256:6b418afe5df18233fc6b6093deb82a32895b6bb0b1155c2cdb05203f583053f1",
+ "sha256:76cf573e5a365e790396a5cc2b909812633409306c6531a6877c59061e42c4f2",
+ "sha256:7b515674acfdcadb0eb5d00d8a709868173acece5cb0be3dd165950cbfdf5409",
+ "sha256:7b770ed79542ed52c519119473898198761d78beb24b107acf3ad65deae61f1f",
+ "sha256:7d2278d59425777cfcb19735018d897ca8303abe67cc735f9f97177ceff8027f",
+ "sha256:7e91ee82f4199af8c43d8158024cbdff3d931df350252288f0d4ce656df7f3b5",
+ "sha256:821b7f59b99551c69c85a6039c65b75f5683bdc63270fec660f75da67469ca24",
+ "sha256:822068f85e12a6e292803e112ab876bc03ed1f03dddb80154c395f891ca6b31e",
+ "sha256:8340225bd5e7a701c0fa98284c849c9b9fc9238abf53a0ebd90900f25d39a4e4",
+ "sha256:85cabf64adec449132e55616e7ca3e1000ab449d1d0f9d7f83146ed5bdcb6d8a",
+ "sha256:880bbbcbe2fca64e2f4d8e04db47bcdf504936fa2b33933efd945e1b429bea8c",
+ "sha256:8d0b4612b66ff5d62d03bcaa043bb018f74dfea51184e53f067e6fdcba4bd8de",
+ "sha256:8e20cb5a47247e383cf4ff523205060991021233ebd6f924bca927fcf25cf86f",
+ "sha256:925073b2fe14ab9b87e73f9a5fde6ce6392da430f3004d8b72cc86f746f5163b",
+ "sha256:998c7c41910666d2976928c38ea96a70d1aa43be6fe502f21a651e17483a43c5",
+ "sha256:9b22c5c66f67ae00c0199f6055705bc3eb3fcb08d03d2ec4059a2b1b25ed48d7",
+ "sha256:9f102706d0ca011de571de32c3247c6476b55bb6bc65a20f682f000b07a4852a",
+ "sha256:a08cff61517ee26cb56f1e949cca38caabe9ea9fbb4b1e10a805dc39844b7d5c",
+ "sha256:a0a336d6d3e8b234a3aae3c674873d8f0e720b76bc1d9416866c41cd9500ffb9",
+ "sha256:a35f8b7fa99f90dd2f5dc5a9fa12332642f087a7641289ca6c40d6e1a2637d8e",
+ "sha256:a38486985ca49cfa574a507e7a2215c0c780fd1778bb6290c21193b7211702ab",
+ "sha256:a5da296eb617d18e497bcf0a5c528f5d3b18dadb3619fbdadf4ed2356ef8d941",
+ "sha256:a6e441a86553c310258aca15d1c05903aaf4965b23f3bc2d55f200804e005ee5",
+ "sha256:a82d05da00a58b8e4c0008edbc8a4b6ec5a4bc1e2ee0fb6ed157cf634ed7fa45",
+ "sha256:ab323679b8b3030000f2be63e22cdeea5b47ee0abd2d6a1dc0c8103ddaa56cd7",
+ "sha256:b1f42b6921d0e81b1bcb5e395bc091a70f41c4d4e55ba99c6da2b31626c44892",
+ "sha256:b23e19989c355ca854276178a0463951a653309fb8e57ce674497f2d9f208746",
+ "sha256:b264171e3143d842ded311b7dccd46ff9ef34247129ff5bf5066123c55c2431c",
+ "sha256:b26a29f0b7fc6f0897f043ca366142d2b609dc60756ee6e4e90b5f762c6adc53",
+ "sha256:b64d891da92e232c36976c80ed7ebb383e3f148489796d8d31a5b6a677825efe",
+ "sha256:b9cc34af337a97d470040f99ba4282f6e6bac88407d021688a5d585e44a23184",
+ "sha256:bc718cd47b765e790eecb74d044cc8d37d58562f6c314ee9484df26276d36a38",
+ "sha256:be7292c55101e22f2a3d4d8913944cbea71eea90792bf914add27454a13905df",
+ "sha256:c83203addf554215463b59f6399835201999b5e48019dc17f182ed5ad87205c9",
+ "sha256:c9ec3eaf616d67db0764b3bb983962b4f385a1f08304fd30c7283954e6a7869b",
+ "sha256:ca34efc80a29351897e18888c71c6aca4a359247c87e0b1c7ada14f0ab0c0fb2",
+ "sha256:ca989b91cf3a3ba28930a9fc1e9aeafc2a395448641df1f387a2d394638943b0",
+ "sha256:d02a5399126a53492415d4906ab0ad0375a5456cc05c3fc0fc4ca11771745cda",
+ "sha256:d17bc7c2ccf49c478c5bdd447594e82692c74222698cfc9b5daae7ae7e90743b",
+ "sha256:d5bf6545cd27aaa8a13033ce56354ed9e25ab0e4ac3b5392b763d8d04b08e0c5",
+ "sha256:d6b430a9938a5a5d85fc107d852262ddcd48602c120e3dbb02137c83d212b380",
+ "sha256:da248f93f0418a9e9d94b0080d7ebc407a9a5e6d0b57bb30db9b5cc28de1ad33",
+ "sha256:da4dd7c9c50c059aba52b3524f84d7de956f7fef88f0bafcf4ad7dde94a064e8",
+ "sha256:df0623dcf9668ad0445e0558a21211d4e9a149ea8f5666917c8eeec515f0a6d1",
+ "sha256:e5168986b90a8d1f2f9dc1b841467c74221bd752537b99761a93d2d981e04889",
+ "sha256:efa29c2fe6b4fdd32e8ef81c1528506895eca86e1d8c4657fda04c9b3786ddf9",
+ "sha256:f1496ea22ca2c830cbcbd473de8f114a320da308438ae65abad6bab7867fe38f",
+ "sha256:f49e52d174375a7def9915c9f06ec4e569d235ad428f70751765f48d5926678c"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
- "version": "==4.9.1"
+ "version": "==4.9.2"
},
"markupsafe": {
"hashes": [
@@ -665,31 +672,31 @@
},
"packaging": {
"hashes": [
- "sha256:2198ec20bd4c017b8f9717e00f0c8714076fc2fd93816750ab48e2c41de2cfd3",
- "sha256:957e2148ba0e1a3b282772e791ef1d8083648bc131c8ab0c1feba110ce1146c3"
+ "sha256:714ac14496c3e68c99c29b00845f7a2b85f3bb6f1078fd9f72fd20f0570002b2",
+ "sha256:b6ad297f8907de0fa2fe1ccbd26fdaf387f5f47c7275fedf8cce89f99446cf97"
],
"markers": "python_version >= '3.7'",
- "version": "==22.0"
+ "version": "==23.0"
},
"protobuf": {
"hashes": [
- "sha256:25266bf373ee06d5d66f9eb1ec9d434b243dccce5c32faf151054cfa6f9dcbf1",
- "sha256:260e346927fd4e6fbb49ab545137b19610c24a1d853dc5f29ddf777ab1987211",
- "sha256:2c6a4d13732d9b094db31b3841986c38b17ac61a3fe05ee26a779d94c4c3fb43",
- "sha256:4922e3320ed70e81f05060822da36923d09fd9e04e17f411f2d8d8d0070f9f5c",
- "sha256:4b75c947289a2e9c1f37d21c593f1ef6fb4fed33977dfb2ac84f799eb29a8ff4",
- "sha256:4d01ef83517c181d60ea1c6d0b2f644be250ade740d6554a2f5a021b1ad622e3",
- "sha256:553e35c0878f6855e55f01a14561e6bce6df79b6636a5acf83b9d9ac7eab7922",
- "sha256:85ccb4753ee21de7dc81a7a68a051f25dbe133ffa01a639ac998427d0b223387",
- "sha256:a5a14b907a191319e7a58b38c583bbf50deb21e002f723a912c5e4f6969a778e",
- "sha256:a944dc9550baae276afc7dc8193191d4c2ad660270a1e5ed5a71539817ebe2e2",
- "sha256:bab4b21a986ded225b9392c07ce21c35d790951f51e1ebfd32e4d443b05c3726",
- "sha256:c3b9e329b4c247dc3ba5c50f60915a84e08278eb6d9e3fa674d0d04ff816bfd7",
- "sha256:d91a47c77b33580024b0271b65bb820c4e0264c25eb49151ad01e691de8fa0b6",
- "sha256:efb16b16fd3eef25357f84d516062753014b76279ce4e0ec4880badd2fba7370"
+ "sha256:1f22ac0ca65bb70a876060d96d914dae09ac98d114294f77584b0d2644fa9c30",
+ "sha256:237216c3326d46808a9f7c26fd1bd4b20015fb6867dc5d263a493ef9a539293b",
+ "sha256:27f4d15021da6d2b706ddc3860fac0a5ddaba34ab679dc182b60a8bb4e1121cc",
+ "sha256:299ea899484ee6f44604deb71f424234f654606b983cb496ea2a53e3c63ab791",
+ "sha256:3d164928ff0727d97022957c2b849250ca0e64777ee31efd7d6de2e07c494717",
+ "sha256:6ab80df09e3208f742c98443b6166bcb70d65f52cfeb67357d52032ea1ae9bec",
+ "sha256:78a28c9fa223998472886c77042e9b9afb6fe4242bd2a2a5aced88e3f4422aa7",
+ "sha256:7cd532c4566d0e6feafecc1059d04c7915aec8e182d1cf7adee8b24ef1e2e6ab",
+ "sha256:89f9149e4a0169cddfc44c74f230d7743002e3aa0b9472d8c28f0388102fc4c2",
+ "sha256:a53fd3f03e578553623272dc46ac2f189de23862e68565e83dde203d41b76fc5",
+ "sha256:b135410244ebe777db80298297a97fbb4c862c881b4403b71bac9d4107d61fd1",
+ "sha256:b98d0148f84e3a3c569e19f52103ca1feacdac0d2df8d6533cf983d1fda28462",
+ "sha256:d1736130bce8cf131ac7957fa26880ca19227d4ad68b4888b3be0dea1f95df97",
+ "sha256:f45460f9ee70a0ec1b6694c6e4e348ad2019275680bd68a1d9314b8c7e01e574"
],
"markers": "python_version >= '3.7'",
- "version": "==4.21.11"
+ "version": "==4.21.12"
},
"pyaes": {
"hashes": [
@@ -774,11 +781,11 @@
},
"pygments": {
"hashes": [
- "sha256:56a8508ae95f98e2b9bdf93a6be5ae3f7d8af858b43e02c5a2ff083726be40c1",
- "sha256:f643f331ab57ba3c9d89212ee4a2dabc6e94f117cf4eefde99a0574720d14c42"
+ "sha256:b3ed06a9e8ac9a9aae5a6f5dbe78a8a58655d17b43b93c078f094ddc476ae297",
+ "sha256:fa7bd7bd2771287c0de303af8bfdfc731f51bd2c6a47ab69d117138893b82717"
],
"markers": "python_version >= '3.6'",
- "version": "==2.13.0"
+ "version": "==2.14.0"
},
"pyparsing": {
"hashes": [
@@ -822,18 +829,18 @@
},
"python-twitter-v2": {
"hashes": [
- "sha256:18c14853da8b499775a11a3f5e1d0692a7017fa41eca91ac5afa73f35b935a90",
- "sha256:fbe582ae7c6b33f6055b97e23dd106874e6650091d257fe67bfd024b96ebf8d6"
+ "sha256:2397d518c17bfbc16a3d414b1cf6d3c231fd8d322f21c755ac2215c9ee675537",
+ "sha256:4e03a30b2570fa4f17fbc7293d850fb8276c66be106d55e460b9287de37e1dd2"
],
"index": "pypi",
- "version": "==0.8.0"
+ "version": "==0.8.1"
},
"pytz": {
"hashes": [
- "sha256:222439474e9c98fced559f1709d89e6c9cbf8d79c794ff3eb9f8800064291427",
- "sha256:e89512406b793ca39f5971bc999cc538ce125c0e51c27941bef4568b460095e2"
+ "sha256:7ccfae7b4b2c067464a6733c6261673fdb8fd1be905460396b97a073e9fa683a",
+ "sha256:93007def75ae22f7cd991c84e02d434876818661f8df9ad5df9e950ff4e52cfd"
],
- "version": "==2022.6"
+ "version": "==2022.7"
},
"pytz-deprecation-shim": {
"hashes": [
@@ -1009,11 +1016,11 @@
},
"rich": {
"hashes": [
- "sha256:a4eb26484f2c82589bd9a17c73d32a010b1e29d89f1604cd9bf3a2097b81bb5e",
- "sha256:ba3a3775974105c221d31141f2c116f4fd65c5ceb0698657a11e9f295ec93fd0"
+ "sha256:25f83363f636995627a99f6e4abc52ed0970ebbd544960cc63cbb43aaac3d6f0",
+ "sha256:41fe1d05f433b0f4724cda8345219213d2bfa472ef56b2f64f415b5b94d51b04"
],
- "markers": "python_version < '4' and python_full_version >= '3.6.3'",
- "version": "==12.6.0"
+ "markers": "python_version >= '3.7'",
+ "version": "==13.0.1"
},
"rsa": {
"hashes": [
@@ -1080,11 +1087,11 @@
},
"telethon": {
"hashes": [
- "sha256:148ac8c27908853d5d8a116d55ce947e9ba167bb697c75226ae95645b2e5a504",
- "sha256:de7a1619110a2c06390fb5340839c6503c6b108b5f1a2f3bbe1ef60f02cecacb"
+ "sha256:3ec7ea04e61e0179dd08b974b609814e1a5298eeda3d68368a34bba754f43aec",
+ "sha256:d894f6ef2bf2cb119f6413b9f620957503785bab0999694b4bf67dea36f8ee09"
],
"index": "pypi",
- "version": "==1.26.0"
+ "version": "==1.26.1"
},
"text-unidecode": {
"hashes": [
@@ -1289,11 +1296,11 @@
"develop": {
"autopep8": {
"hashes": [
- "sha256:8b1659c7f003e693199f52caffdc06585bb0716900bbc6a7442fd931d658c077",
- "sha256:ad924b42c2e27a1ac58e432166cc4588f5b80747de02d0d35b1ecbd3e7d57207"
+ "sha256:be5bc98c33515b67475420b7b1feafc8d32c1a69862498eda4983b45bffd2687",
+ "sha256:d27a8929d8dcd21c0f4b3859d2d07c6c25273727b98afc984c039df0f0d86566"
],
"index": "pypi",
- "version": "==2.0.0"
+ "version": "==2.0.1"
},
"pycodestyle": {
"hashes": [
@@ -1308,7 +1315,7 @@
"sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc",
"sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f"
],
- "markers": "python_version >= '3.7'",
+ "markers": "python_version < '3.11'",
"version": "==2.0.1"
}
}
diff --git a/src/configs/v2config.py b/src/configs/v2config.py
index 7260d41..5b47d0f 100644
--- a/src/configs/v2config.py
+++ b/src/configs/v2config.py
@@ -6,6 +6,7 @@ from typing import List
from archivers import Archiverv2
from feeders import Feeder
from databases import Database
+from formatters import Formatter
from storages import StorageV2
from steps.step import Step
from enrichers import Enricher
@@ -21,13 +22,14 @@ class ConfigV2:
Enricher,
Archiverv2,
Database,
- StorageV2
+ StorageV2,
+ Formatter
# Util
]
feeder: Step # TODO:= BaseFeeder
+ formatter: Formatter
archivers: List[Archiverv2] = field(default_factory=[]) # TODO: fix type
enrichers: List[Enricher] = field(default_factory=[])
- formatters: List[Step] = field(default_factory=[]) # TODO: fix type
storages: List[Step] = field(default_factory=[]) # TODO: fix type
databases: List[Database] = field(default_factory=[])
@@ -50,6 +52,7 @@ class ConfigV2:
for configurable in self.configurable_parents:
child: Step
for child in configurable.__subclasses__():
+ assert child.configs() is not None and type(child.configs()) == dict, f"class '{child.name}' should have a configs method returning a dict."
for config, details in child.configs().items():
assert "." not in child.name, f"class prop name cannot contain dots('.'): {child.name}"
assert "." not in config, f"config property cannot contain dots('.'): {config}"
@@ -87,6 +90,7 @@ class ConfigV2:
# print("config.py", self.config)
self.feeder = Feeder.init(steps.get("feeder", "cli_feeder"), self.config)
+ self.formatter = Formatter.init(steps.get("formatter", "html_formatter"), self.config)
self.enrichers = [Enricher.init(e, self.config) for e in steps.get("enrichers", [])]
self.archivers = [Archiverv2.init(e, self.config) for e in steps.get("archivers", [])]
self.databases = [Database.init(e, self.config) for e in steps.get("databases", [])]
@@ -97,6 +101,7 @@ class ConfigV2:
print("archivers", [e for e in self.archivers])
print("databases", [e for e in self.databases])
print("storages", [e for e in self.storages])
+ print("formatter", self.formatter)
def validate(self):
pass
diff --git a/src/formatters/__init__.py b/src/formatters/__init__.py
new file mode 100644
index 0000000..07a52a0
--- /dev/null
+++ b/src/formatters/__init__.py
@@ -0,0 +1,2 @@
+from .formatter import Formatter
+from .html_formatter import HtmlFormatter
\ No newline at end of file
diff --git a/src/formatters/formatter.py b/src/formatters/formatter.py
new file mode 100644
index 0000000..7199be2
--- /dev/null
+++ b/src/formatters/formatter.py
@@ -0,0 +1,21 @@
+from __future__ import annotations
+from dataclasses import dataclass
+from abc import abstractmethod
+from metadata import Metadata
+from steps.step import Step
+
+
+@dataclass
+class Formatter(Step):
+ name = "formatter"
+
+ def __init__(self, config: dict) -> None:
+ # without this STEP.__init__ is not called
+ super().__init__(config)
+
+ def init(name: str, config: dict) -> Formatter:
+ # only for code typing
+ return Step.init(name, config, Formatter)
+
+ @abstractmethod
+ def format(self, item) -> Metadata: return None
\ No newline at end of file
diff --git a/src/formatters/html_formatter.py b/src/formatters/html_formatter.py
new file mode 100644
index 0000000..6c278f5
--- /dev/null
+++ b/src/formatters/html_formatter.py
@@ -0,0 +1,36 @@
+from __future__ import annotations
+from dataclasses import dataclass
+from abc import abstractmethod
+from metadata import Metadata
+from media import Media
+from formatters import Formatter
+from jinja2 import Environment, FileSystemLoader
+import uuid, os, pathlib
+
+
+@dataclass
+class HtmlFormatter(Formatter):
+ name = "html_formatter"
+
+ def __init__(self, config: dict) -> None:
+ # without this STEP.__init__ is not called
+ super().__init__(config)
+ self.environment = Environment(loader=FileSystemLoader(os.path.join(pathlib.Path(__file__).parent.resolve(), "templates/")))
+ self.template = self.environment.get_template("html_template.html")
+
+ @staticmethod
+ def configs() -> dict:
+ return {}
+
+ def format(self, item: Metadata) -> Media:
+ print("FORMATTING")
+ content = self.template.render(
+ url=item.get_url(),
+ title=item.get_title(),
+ media=item.media,
+ metadata=item.get_clean_metadata()
+ )
+ html_path = os.path.join(item.get("tmp_dir"), f"formatted{str(uuid.uuid4())}.html")
+ with open(html_path, mode="w", encoding="utf-8") as outf:
+ outf.write(content)
+ return Media(filename=html_path)
diff --git a/src/formatters/templates/html_template.html b/src/formatters/templates/html_template.html
new file mode 100644
index 0000000..fa278eb
--- /dev/null
+++ b/src/formatters/templates/html_template.html
@@ -0,0 +1,101 @@
+{# templates/results.html #}
+
+
+
+
+
+
+
+ {{ url }}
+
+
+
+
+
+ {% if 'image' in m.mimetype %}
+
+ {% elif 'video' in m.mimetype %}
+
+ {% elif 'audio' in m.mimetype %}
+
+ {% else %}
+ No preview available, please open the link.
+ {% endif %}
+
+
+ {% endfor %}
+
+
metadata
+
+
+
key
+
value
+
+ {% for key in metadata %}
+
+
{{ key }}
+
{{ metadata[key] }}
+
+ {% endfor %}
+
+
+
+
+
\ No newline at end of file
diff --git a/src/media.py b/src/media.py
index c499b5b..58eae27 100644
--- a/src/media.py
+++ b/src/media.py
@@ -3,8 +3,7 @@ from __future__ import annotations
from ast import List
from typing import Any, Union, Dict
from dataclasses import dataclass
-from datetime import datetime
-import json
+import mimetypes
@dataclass
@@ -12,5 +11,11 @@ class Media:
filename: str
key: str = None
cdn_url: str = None
+ mimetype: str = None # eg: image/jpeg
# id: str = None
# hash: str = None # TODO: added by enrichers
+
+ def set_mimetype(self) -> Media:
+ if not self.mimetype:
+ self.mimetype = mimetypes.guess_type(self.filename)[0]
+ return self
diff --git a/src/metadata.py b/src/metadata.py
index f48c636..ceece8d 100644
--- a/src/metadata.py
+++ b/src/metadata.py
@@ -3,7 +3,8 @@ from __future__ import annotations
from ast import List, Set
from typing import Any, Union, Dict
from dataclasses import dataclass, field
-import datetime
+import datetime, mimetypes
+from loguru import logger
# import json
from media import Media
@@ -12,9 +13,11 @@ from media import Media
@dataclass
class Metadata:
status: str = ""
+ _processed_at: datetime = datetime.datetime.utcnow().replace(tzinfo=datetime.timezone.utc)
metadata: Dict[str, Any] = field(default_factory=dict)
tmp_keys: Set[str] = field(default_factory=set) # keys that are not to be saved in DBs
media: List[Media] = field(default_factory=list)
+ final_media: Media = None # can be overwritten by formatters
rearchivable: bool = False
# def __init__(self, url, metadata = {}) -> None:
@@ -85,13 +88,20 @@ class Metadata:
return ts
def add_media(self, media: Media) -> Metadata:
- # print(f"adding {filename} to {self.metadata.get('media')}")
- # return self.set("media", self.get_media() + [filename])
- # return self.get_media().append(media)
+ media.set_mimetype()
return self.media.append(media)
+ def set_final_media(self, final: Media) -> Metadata:
+ if final:
+ if self.final_media:
+ logger.warning(f"overwriting final media value :{self.final_media} with {final}")
+ final.set_mimetype()
+ self.final_media = final
+ return self
+
def get_single_media(self) -> Media:
- # TODO: check if formatters were applied and choose with priority
+ if self.final_media:
+ return self.final_media
return self.media[0]
# def as_json(self) -> str:
@@ -99,6 +109,12 @@ class Metadata:
# return json.dumps(self.metadata)
# #TODO: datetime is not serializable
+ def get_clean_metadata(self) -> Metadata:
+ return dict(
+ {k: v for k, v in self.metadata.items() if k not in self.tmp_keys},
+ **{"processed_at": self._processed_at} # TODO: move to enrichment
+ )
+
def cleanup(self) -> Metadata:
# TODO: refactor so it returns a JSON with all intended properties, except tmp_keys
# the code below leads to errors if database needs tmp_keys after they are removed
diff --git a/src/orchestrator.py b/src/orchestrator.py
index 3bc5ea7..5a8ff31 100644
--- a/src/orchestrator.py
+++ b/src/orchestrator.py
@@ -4,6 +4,8 @@ from typing import Union, Dict
from dataclasses import dataclass
from archivers import Archiverv2
+from feeders import Feeder
+from formatters import Formatter
from storages import StorageV2
from enrichers import Enricher
from databases import Database
@@ -13,7 +15,6 @@ import tempfile, time, traceback
from loguru import logger
-
"""
how not to couple the different pieces of logic
due to the use of constants for the metadata keys?
@@ -132,7 +133,8 @@ class ArchivingOrchestrator:
# Archiver.init(a, config)
# for a in config.archivers
# ]
- self.feeder = config.feeder
+ self.feeder : Feeder = config.feeder
+ self.formatter : Formatter = config.formatter
self.enrichers = config.enrichers
self.archivers: List[Archiverv2] = config.archivers
self.databases: List[Database] = config.databases
@@ -237,14 +239,21 @@ class ArchivingOrchestrator:
for e in self.enrichers:
result.merge(e.enrich(result))
- # formatters, enrichers, and storages will sometimes look for specific properties: eg
Screenshot:
- for f in self.formatters:
- result.merge(f.format(result))
-
- # storage
+ # store media
+ unstored_media = result.media[::]
+ result.media = []
for s in self.storages:
- for i, m in enumerate(result.media):
- result.media[i] = s.store(m, result)
+ for m in unstored_media:
+ result.media.append(s.store(m, result))
+
+ # formatters, enrichers, and storages will sometimes look for specific properties: eg
Screenshot:
+ # TODO: should there only be 1 formatter?
+ # for f in self.formatters:
+ # result.merge(f.format(result))
+ # final format and store it
+ if (final_media := self.formatter.format(result)):
+ for s in self.storages:
+ result.set_final_media(s.store(final_media, result))
# signal completion to databases (DBs, Google Sheets, CSV, ...)
# a hash registration service could be one database: forensic archiving
diff --git a/src/steps/gsheet.py b/src/steps/gsheet.py
index 6bfb5d7..262add1 100644
--- a/src/steps/gsheet.py
+++ b/src/steps/gsheet.py
@@ -12,6 +12,7 @@ class Gsheets(Step):
super().__init__(config)
self.gsheets_client = gspread.service_account(filename=self.service_account)
assert type(self.header) == int, f"header ({self.header}) value must be an integer not {type(self.header)}"
+ assert self.sheet is not None, "You need to define a sheet name in your orchestration file when using gsheets."
@staticmethod
def configs() -> dict:
From 0cb593fd2169f3134c47cd0446cab7369d7d6262 Mon Sep 17 00:00:00 2001
From: msramalho <19508417+msramalho@users.noreply.github.com>
Date: Wed, 11 Jan 2023 00:03:47 +0000
Subject: [PATCH 033/190] wayback enricher ready
---
src/archivers/telethon_archiverv2.py | 2 +-
src/databases/gsheet_db.py | 6 +-
src/enrichers/__init__.py | 3 +-
src/enrichers/enricher.py | 2 +-
src/enrichers/screenshot_enricher.py | 23 ++++---
src/enrichers/wayback_enricher.py | 68 +++++++++++++++++++
src/formatters/html_formatter.py | 2 +-
src/formatters/templates/html_template.html | 7 +-
src/media.py | 5 +-
src/metadata.py | 18 ++++-
src/orchestrator.py | 75 ++-------------------
11 files changed, 121 insertions(+), 90 deletions(-)
create mode 100644 src/enrichers/wayback_enricher.py
diff --git a/src/archivers/telethon_archiverv2.py b/src/archivers/telethon_archiverv2.py
index ea19c92..66ecd74 100644
--- a/src/archivers/telethon_archiverv2.py
+++ b/src/archivers/telethon_archiverv2.py
@@ -121,7 +121,7 @@ class TelethonArchiver(Archiverv2):
media_posts = self._get_media_posts_in_group(chat, post)
logger.debug(f'got {len(media_posts)=} for {url=}')
- tmp_dir = item.get("tmp_dir")
+ tmp_dir = item.get_tmp_dir()
group_id = post.grouped_id if post.grouped_id is not None else post.id
title = post.message
diff --git a/src/databases/gsheet_db.py b/src/databases/gsheet_db.py
index ba3785a..26aae68 100644
--- a/src/databases/gsheet_db.py
+++ b/src/databases/gsheet_db.py
@@ -68,13 +68,15 @@ class GsheetsDb(Database):
batch_if_valid('title', item.get_title())
batch_if_valid('text', item.get("content", "")[:500])
batch_if_valid('timestamp', item.get_timestamp())
+ if (screenshot := item.get_media_by_id("screenshot")):
+ batch_if_valid('screenshot', screenshot.cdn_url)
+ # batch_if_valid('status', item.status)
# TODO: AFTER ENRICHMENTS
# batch_if_valid('hash', media.hash)
# batch_if_valid('thumbnail', result.thumbnail, f'=IMAGE("{result.thumbnail}")')
# batch_if_valid('thumbnail_index', result.thumbnail_index)
# batch_if_valid('duration', result.duration, str(result.duration))
- # batch_if_valid('screenshot', result.screenshot)
# if result.wacz is not None:
# batch_if_valid('wacz', result.wacz)
# batch_if_valid('replaywebpage', f'https://replayweb.page/?source={quote(result.wacz)}#view=pages&url={quote(url)}')
@@ -91,5 +93,5 @@ class GsheetsDb(Database):
def _retrieve_gsheet(self, item: Metadata) -> Tuple[GWorksheet, int]:
gw: GWorksheet = item.get("gsheet").get("worksheet")
row: int = item.get("gsheet").get("row")
- #TODO: to make gsheet_db less coupled with gsheet_feeder's "gsheet" parameter, this method could 1st try to fetch "gsheet" from item and, if missing, manage its own singleton - not needed for now
+ # TODO: to make gsheet_db less coupled with gsheet_feeder's "gsheet" parameter, this method could 1st try to fetch "gsheet" from item and, if missing, manage its own singleton - not needed for now
return gw, row
diff --git a/src/enrichers/__init__.py b/src/enrichers/__init__.py
index 503ea2c..2a871d1 100644
--- a/src/enrichers/__init__.py
+++ b/src/enrichers/__init__.py
@@ -1,2 +1,3 @@
from .enricher import Enricher
-from .screenshot_enricher import ScreenshotEnricher
\ No newline at end of file
+from .screenshot_enricher import ScreenshotEnricher
+from .wayback_enricher import WaybackEnricher
\ No newline at end of file
diff --git a/src/enrichers/enricher.py b/src/enrichers/enricher.py
index faf43d8..9d11276 100644
--- a/src/enrichers/enricher.py
+++ b/src/enrichers/enricher.py
@@ -18,4 +18,4 @@ class Enricher(Step, ABC):
return Step.init(name, config, Enricher)
@abstractmethod
- def enrich(self, item: Metadata) -> Metadata: pass
+ def enrich(self, to_enrich: Metadata) -> None: pass
diff --git a/src/enrichers/screenshot_enricher.py b/src/enrichers/screenshot_enricher.py
index 5018859..b008e52 100644
--- a/src/enrichers/screenshot_enricher.py
+++ b/src/enrichers/screenshot_enricher.py
@@ -1,13 +1,14 @@
+from media import Media
from utils import Webdriver
from . import Enricher
from metadata import Metadata
from loguru import logger
+import time, uuid, os
from selenium.common.exceptions import TimeoutException
-import time
class ScreenshotEnricher(Enricher):
- name = "screenshot"
+ name = "screenshot_enricher"
@staticmethod
def configs() -> dict:
@@ -17,16 +18,18 @@ class ScreenshotEnricher(Enricher):
"timeout": {"default": 60, "help": "timeout for taking the screenshot"}
}
- def enrich(self, item: Metadata) -> Metadata:
- url = self.get_url(item)
- print(f"enriching {url=}")
- with Webdriver(self.width, self.height, self.timeout, 'facebook.com' in url) as driver: # TODO: make a util
+ def enrich(self, to_enrich: Metadata) -> None:
+ url = to_enrich.get_url()
+ logger.debug(f"Enriching screenshot for {url=}")
+ with Webdriver(self.width, self.height, self.timeout, 'facebook.com' in url) as driver:
try:
driver.get(url)
time.sleep(2)
+ screenshot_file = os.path.join(to_enrich.get_tmp_dir(), f"screenshot_{str(uuid.uuid4())[0:8]}.png")
+ driver.save_screenshot(screenshot_file)
+ to_enrich.add_media(Media(filename=screenshot_file, id="screenshot"))
except TimeoutException:
logger.info("TimeoutException loading page for screenshot")
-
- #TODO: return saved object
- driver.save_screenshot("TODO-HASH_OR_UUID.png")
- return None
+ except Exception as e:
+ logger.error(f"Got error while loading webdriver for screenshot enricher: {e}")
+ # return None
diff --git a/src/enrichers/wayback_enricher.py b/src/enrichers/wayback_enricher.py
new file mode 100644
index 0000000..09a43e0
--- /dev/null
+++ b/src/enrichers/wayback_enricher.py
@@ -0,0 +1,68 @@
+from utils import Webdriver
+from . import Enricher
+from metadata import Metadata
+from loguru import logger
+from selenium.common.exceptions import TimeoutException
+import time, requests
+
+
+class WaybackEnricher(Enricher):
+ """
+ Submits the current URL to the webarchive and returns a job_id or completed archive
+ """
+ name = "wayback_enricher"
+
+ def __init__(self, config: dict) -> None:
+ # without this STEP.__init__ is not called
+ super().__init__(config)
+ assert type(self.secret) == str and len(self.secret) > 0, "please provide a value for the wayback_enricher API key"
+ assert type(self.secret) == str and len(self.secret) > 0, "please provide a value for the wayback_enricher API secret"
+
+ @staticmethod
+ def configs() -> dict:
+ return {
+ "timeout": {"default": 5, "help": "number of seconds to wait for a response from webarchive's wayback machine, after that only job_id is saved but page will still be processed."},
+ "key": {"default": None, "help": "wayback API key. to get credentials visit https://archive.org/account/s3.php"},
+ "secret": {"default": None, "help": "wayback API secret. to get credentials visit https://archive.org/account/s3.php"}
+ }
+
+ def enrich(self, to_enrich: Metadata) -> None:
+ url = to_enrich.get_url()
+ logger.debug(f"Enriching wayback for {url=}")
+
+ ia_headers = {
+ "Accept": "application/json",
+ "Authorization": f"LOW {self.key}:{self.secret}"
+ }
+ r = requests.post('https://web.archive.org/save/', headers=ia_headers, data={'url': url})
+
+ if r.status_code != 200:
+ logger.error(em:=f"Internet archive failed with status of {r.status_code}: {r.json()}")
+ to_enrich.set("wayback", em)
+ return
+
+ # check job status
+ job_id = r.json()['job_id']
+
+ # waits at most timeout seconds until job is completed, otherwise only enriches the job_id information
+ start_time = time.time()
+ wayback_url = False
+ attempt = 1
+ while not wayback_url and time.time() - start_time <= self.timeout:
+ try:
+
+ logger.debug(f"GETting status for {job_id=} on {url=} ({attempt=})")
+ r_status = requests.get(f'https://web.archive.org/save/status/{job_id}', headers=ia_headers)
+ r_json = r_status.json()
+ if r_status.status_code == 200 and r_json['status'] == 'success':
+ wayback_url = f"https://web.archive.org/web/{r_json['timestamp']}/{r_json['original_url']}"
+ except Exception as e:
+ logger.warning(f"error fetching status for {url=} due to: {e}")
+ if not wayback_url:
+ attempt += 1
+ time.sleep(1) # TODO: can be improved with exponential backoff
+
+ if wayback_url:
+ to_enrich.set("wayback", wayback_url)
+ else:
+ to_enrich.set("wayback", {"job_id": job_id, "check_status": f'https://web.archive.org/save/status/{job_id}'})
diff --git a/src/formatters/html_formatter.py b/src/formatters/html_formatter.py
index 6c278f5..7443568 100644
--- a/src/formatters/html_formatter.py
+++ b/src/formatters/html_formatter.py
@@ -30,7 +30,7 @@ class HtmlFormatter(Formatter):
media=item.media,
metadata=item.get_clean_metadata()
)
- html_path = os.path.join(item.get("tmp_dir"), f"formatted{str(uuid.uuid4())}.html")
+ html_path = os.path.join(item.get_tmp_dir(), f"formatted{str(uuid.uuid4())}.html")
with open(html_path, mode="w", encoding="utf-8") as outf:
outf.write(content)
return Media(filename=html_path)
diff --git a/src/formatters/templates/html_template.html b/src/formatters/templates/html_template.html
index fa278eb..fc986f0 100644
--- a/src/formatters/templates/html_template.html
+++ b/src/formatters/templates/html_template.html
@@ -60,6 +60,9 @@
{% endif %}
key: {{ m.key }}
type: {{ m.mimetype }}
+ {% if m.id | length >0 %}
+
id: {{ m.id }}
+ {% endif %}
@@ -91,11 +94,13 @@
{% for key in metadata %}
{{ key }}
-
{{ metadata[key] }}
+
{{ metadata[key] | urlize }}
{% endfor %}
+
+
made with bellingcat/auto-archiver, add suggestions and report issues on the project's github page
\ No newline at end of file
diff --git a/src/media.py b/src/media.py
index 58eae27..3c416be 100644
--- a/src/media.py
+++ b/src/media.py
@@ -12,10 +12,13 @@ class Media:
key: str = None
cdn_url: str = None
mimetype: str = None # eg: image/jpeg
- # id: str = None
+ id: str = None # in case this type of media needs a special id, eg: screenshot
# hash: str = None # TODO: added by enrichers
def set_mimetype(self) -> Media:
if not self.mimetype:
self.mimetype = mimetypes.guess_type(self.filename)[0]
return self
+
+ def is_video(self) -> bool:
+ return self.mimetype.startswith("video")
diff --git a/src/metadata.py b/src/metadata.py
index ceece8d..7af923c 100644
--- a/src/metadata.py
+++ b/src/metadata.py
@@ -28,9 +28,12 @@ class Metadata:
"""
merges two Metadata instances, will overwrite according to overwrite_left flag
"""
+ if right is None: return self
if overwrite_left:
- self.status = right.status
+ if right.status and len(right.status):
+ self.status = right.status
self.rearchivable |= right.rearchivable
+ self.tmp_keys |= right.tmp_keys
for k, v in right.metadata.items():
assert k not in self.metadata or type(v) == type(self.get(k))
if type(v) not in [dict, list, set] or k not in self.metadata:
@@ -76,6 +79,12 @@ class Metadata:
def get_title(self) -> str:
return self.get("title")
+ def set_tmp_dir(self, tmp_dir: str) -> Metadata:
+ return self.set("tmp_dir", tmp_dir, True)
+
+ def get_tmp_dir(self) -> str:
+ return self.get("tmp_dir")
+
def set_timestamp(self, timestamp: datetime.datetime) -> Metadata:
assert type(timestamp) == datetime.datetime, "set_timestamp expects a datetime instance"
return self.set("timestamp", timestamp)
@@ -88,9 +97,15 @@ class Metadata:
return ts
def add_media(self, media: Media) -> Metadata:
+ if media is None: return
media.set_mimetype()
return self.media.append(media)
+ def get_media_by_id(self, id:str) -> Media:
+ for m in self.media:
+ if m.id == id: return m
+ return None
+
def set_final_media(self, final: Media) -> Metadata:
if final:
if self.final_media:
@@ -100,6 +115,7 @@ class Metadata:
return self
def get_single_media(self) -> Media:
+ #TODO: could be refactored to use a custom media.id
if self.final_media:
return self.final_media
return self.media[0]
diff --git a/src/orchestrator.py b/src/orchestrator.py
index 5a8ff31..3d554e0 100644
--- a/src/orchestrator.py
+++ b/src/orchestrator.py
@@ -52,74 +52,6 @@ Cisticola considerations:
2. So the auto-archiver becomes like a puzzle and fixes to Cisticola scrapers can immediately benefit it, and contributions are focused on a single source or scraping
"""
-# @dataclass
-# class Metadata:
-# # does not handle files, only primitives
-# # the only piece of logic to handle files is the archiver, enricher, and storage
-# status: str
-# # title: str
-# # url: str
-# # hash: str
-# main_file: Metadata
-# metadata: Dict[str, Metadata]
-
-# @staticmethod
-# def merge(left, right : Metadata, overwrite_left=True) -> Metadata:
-# # should return a merged version of the Metadata
-# # will work for archived() and enriched()
-# # what if 2 metadatas contain the same keys? only one can remain! : overwrite_left
-# pass
-
-# def get(self, key) -> Union[Metadata, str]:
-# # goes through metadata and returns the Metadata available
-# pass
-
-# def as_json(self) -> str:
-# # converts all metadata and data into JSON
-# pass
-
-
-"""
-@dataclass
-class ArchiveResult:
- # maybe metadata can have status as well, eg: screenshot fails. should that be registered in the databases? likely yes
- status: str
- url: str
- metadata: Metadata
- # title, url, hash, other={}
- # cdn_url: str = None
- # thumbnail: str = None
- # thumbnail_index: str = None
- # duration: float = None
- # title: str = None
- # timestamp: datetime.datetime = None
- # screenshot: str = None
- # wacz: str = None
- # hash: str = None
- # media: list = field(default_factory=list)
-
- def __init__(self) -> None: pass
-
- def update(self, metadata) -> None:
- # receive a Metadata instance and update itself with it!
- pass
-
- def as_json(self) -> str:
- # converts all metadata and data into JSON
- pass
-"""
-
-"""
-There is a Superclass for:
- * Database (should_process)
-
-How can GSheets work? it needs to feed from a READER (GSheets Feeder)
-
-Once an archiver returns a link to a local file (for eg to a storage), how do we then delete the produced local files?
-The context metadata should include a temporary folder (maybe a LocalStorage instance?)
-"""
-
-
class ArchivingOrchestrator:
def __init__(self, config) -> None:
# in config.py we should test that the archivers exist and log mismatches (blocking execution)
@@ -128,7 +60,7 @@ class ArchivingOrchestrator:
# Is it possible to overwrite config.yaml values? it could be useful: share config file and modify gsheet_feeder.sheet via CLI
# where does that update/processing happen? in config.py
- # reflection for Archiver to know wihch child classes it has? use Archiver.__subclasses__
+ # reflection for Archiver to know which child classes it has? use Archiver.__subclasses__
# self.archivers = [
# Archiver.init(a, config)
# for a in config.archivers
@@ -166,7 +98,7 @@ class ArchivingOrchestrator:
print("ARCHIVING", item)
try:
with tempfile.TemporaryDirectory(dir="./") as tmp_dir:
- item.set("tmp_dir", tmp_dir, True)
+ item.set_tmp_dir(tmp_dir)
result = self.archive(item)
print(result)
except KeyboardInterrupt:
@@ -226,6 +158,7 @@ class ArchivingOrchestrator:
# do they need to be refreshed with every execution?
# this is where the Hashes come from, the place with access to all content
# the archiver does not have access to storage
+ # a.download(result) # TODO: refactor so there's not merge here
result.merge(a.download(result))
# TODO: fix logic
if True or result.is_success(): break
@@ -237,7 +170,7 @@ class ArchivingOrchestrator:
# maybe as a PDF? or a Markdown file
# side captures: screenshot, wacz, webarchive, thumbnails, HTMLgenerator
for e in self.enrichers:
- result.merge(e.enrich(result))
+ e.enrich(result)
# store media
unstored_media = result.media[::]
From 6ca46417feeda7f6ac586214cbf40917f9d9b50f Mon Sep 17 00:00:00 2001
From: msramalho <19508417+msramalho@users.noreply.github.com>
Date: Thu, 12 Jan 2023 02:09:39 +0000
Subject: [PATCH 034/190] local storage + multiple storage support
---
src/archivers/telethon_archiverv2.py | 4 +-
src/databases/gsheet_db.py | 4 +-
src/enrichers/wayback_enricher.py | 2 +-
src/formatters/templates/html_template.html | 19 +++++----
src/media.py | 28 +++++++++----
src/metadata.py | 2 -
src/orchestrator.py | 14 +++----
src/storages/__init__.py | 3 +-
src/storages/local.py | 46 +++++++++++++++++++++
src/storages/s3.py | 25 +++++------
src/storages/storage.py | 24 ++++++-----
11 files changed, 117 insertions(+), 54 deletions(-)
create mode 100644 src/storages/local.py
diff --git a/src/archivers/telethon_archiverv2.py b/src/archivers/telethon_archiverv2.py
index 66ecd74..6851cb5 100644
--- a/src/archivers/telethon_archiverv2.py
+++ b/src/archivers/telethon_archiverv2.py
@@ -132,11 +132,11 @@ class TelethonArchiver(Archiverv2):
if mp.entities:
other_media_urls = [e.url for e in mp.entities if hasattr(e, "url") and e.url and self._guess_file_type(e.url) in ["video", "image", "audio"]]
if len(other_media_urls):
- logger.debug(f"Got {len(other_media_urls)} other medial urls from {mp.id=}: {other_media_urls}")
+ logger.debug(f"Got {len(other_media_urls)} other media urls from {mp.id=}: {other_media_urls}")
for i, om_url in enumerate(other_media_urls):
filename = os.path.join(tmp_dir, f'{chat}_{group_id}_{i}')
self.download_from_url(om_url, filename)
- result.add_media(Media(filename))
+ result.add_media(Media(filename=filename, id=f"{group_id}_{i}"))
filename_dest = os.path.join(tmp_dir, f'{chat}_{group_id}', str(mp.id))
filename = self.client.download_media(mp.media, filename_dest)
diff --git a/src/databases/gsheet_db.py b/src/databases/gsheet_db.py
index 26aae68..0cf65ed 100644
--- a/src/databases/gsheet_db.py
+++ b/src/databases/gsheet_db.py
@@ -63,13 +63,13 @@ class GsheetsDb(Database):
media: Media = item.get_single_media()
- batch_if_valid('archive', media.cdn_url)
+ batch_if_valid('archive', "\n".join(media.urls))
batch_if_valid('date', True, datetime.datetime.utcnow().replace(tzinfo=datetime.timezone.utc).isoformat())
batch_if_valid('title', item.get_title())
batch_if_valid('text', item.get("content", "")[:500])
batch_if_valid('timestamp', item.get_timestamp())
if (screenshot := item.get_media_by_id("screenshot")):
- batch_if_valid('screenshot', screenshot.cdn_url)
+ batch_if_valid('screenshot', "\n".join(screenshot.urls))
# batch_if_valid('status', item.status)
# TODO: AFTER ENRICHMENTS
diff --git a/src/enrichers/wayback_enricher.py b/src/enrichers/wayback_enricher.py
index 09a43e0..bf55923 100644
--- a/src/enrichers/wayback_enricher.py
+++ b/src/enrichers/wayback_enricher.py
@@ -21,7 +21,7 @@ class WaybackEnricher(Enricher):
@staticmethod
def configs() -> dict:
return {
- "timeout": {"default": 5, "help": "number of seconds to wait for a response from webarchive's wayback machine, after that only job_id is saved but page will still be processed."},
+ "timeout": {"default": 15, "help": "seconds to wait for successful archive confirmation from wayback, if more than this passes the result contains the job_id so the status can later be checked manually."},
"key": {"default": None, "help": "wayback API key. to get credentials visit https://archive.org/account/s3.php"},
"secret": {"default": None, "help": "wayback API secret. to get credentials visit https://archive.org/account/s3.php"}
}
diff --git a/src/formatters/templates/html_template.html b/src/formatters/templates/html_template.html
index fc986f0..f488a5f 100644
--- a/src/formatters/templates/html_template.html
+++ b/src/formatters/templates/html_template.html
@@ -26,6 +26,7 @@
table,
th,
td {
+ margin: auto;
border: 1px solid;
border-collapse: collapse;
}
@@ -43,18 +44,17 @@