From b680700b22784b45db0375fafaeacb1e26f775ae Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Tue, 3 May 2022 20:32:23 +0200 Subject: [PATCH 01/84] ignoring config file --- .gitignore | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 9d83858..c61326d 100644 --- a/.gitignore +++ b/.gitignore @@ -8,4 +8,5 @@ __pycache__/ anu.html *.log .pytest_cach -anon* \ No newline at end of file +anon* +config.json \ No newline at end of file From 24340190af6253cc10cea1547539c5feb0efc223 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Tue, 3 May 2022 20:32:53 +0200 Subject: [PATCH 02/84] s3 storage config refactor --- storages/s3_storage.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/storages/s3_storage.py b/storages/s3_storage.py index d7c9644..084153d 100644 --- a/storages/s3_storage.py +++ b/storages/s3_storage.py @@ -11,6 +11,8 @@ class S3Config: key: str secret: str folder: str = "" + endpoint_url: str = "https://{region}.digitaloceanspaces.com" + cdn_url: str = "https://{bucket}.{region}.cdn.digitaloceanspaces.com/{key}" private: bool = False From 03a6611c862fe7d547ae1aff4835fd7dd0d3b575 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Tue, 3 May 2022 20:33:02 +0200 Subject: [PATCH 03/84] adds local storage --- storages/__init__.py | 1 + storages/local_storage.py | 20 ++++++++++++++++++++ 2 files changed, 21 insertions(+) create mode 100644 storages/local_storage.py diff --git a/storages/__init__.py b/storages/__init__.py index 3054d36..8b39d7d 100644 --- a/storages/__init__.py +++ b/storages/__init__.py @@ -1,3 +1,4 @@ # we need to explicitly expose the available imports here from .base_storage import * +from .local_storage import * from .s3_storage import * \ No newline at end of file diff --git a/storages/local_storage.py b/storages/local_storage.py new file mode 100644 index 
0000000..0dcdaef --- /dev/null +++ b/storages/local_storage.py @@ -0,0 +1,20 @@ +import os +from .base_storage import Storage + + +class LocalStorage(Storage): + def __init__(self, folder): + self.folder = folder + if len(self.folder) and self.folder[-1] != '/': + self.folder += '/' + + def get_cdn_url(self, key): + return self.folder + key + + def exists(self, key): + return os.path.isfile(self.get_cdn_url(key)) + + def uploadf(self, file, key, **kwargs): + path = self.get_cdn_url(key) + with open(path, "wb") as outf: + outf.write(file.read()) From a7948ac768190462fba15d100ac68034a6b07692 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Tue, 3 May 2022 20:33:19 +0200 Subject: [PATCH 04/84] extract telegram config --- archivers/telethon_archiver.py | 7 +------ configs/telegram_config.py | 7 +++++++ 2 files changed, 8 insertions(+), 6 deletions(-) create mode 100644 configs/telegram_config.py diff --git a/archivers/telethon_archiver.py b/archivers/telethon_archiver.py index 88bec58..08b7dec 100644 --- a/archivers/telethon_archiver.py +++ b/archivers/telethon_archiver.py @@ -1,19 +1,14 @@ import os import re import html -from dataclasses import dataclass from loguru import logger from storages import Storage from .base_archiver import Archiver, ArchiveResult from telethon.sync import TelegramClient +from configs import TelegramConfig -@dataclass -class TelegramConfig: - api_id: str - api_hash: str - class TelethonArchiver(Archiver): name = "telethon" diff --git a/configs/telegram_config.py b/configs/telegram_config.py new file mode 100644 index 0000000..f5553ad --- /dev/null +++ b/configs/telegram_config.py @@ -0,0 +1,7 @@ + +from dataclasses import dataclass + +@dataclass +class TelegramConfig: + api_id: str + api_hash: str \ No newline at end of file From ac9ed1a0d70e1254d0d22d55dab5c2d94c081e72 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Tue, 3 May 2022 20:33:38 +0200 
Subject: [PATCH 05/84] extract wayback config --- archivers/wayback_archiver.py | 7 +++++++ configs/wayback_config.py | 7 +++++++ 2 files changed, 14 insertions(+) create mode 100644 configs/wayback_config.py diff --git a/archivers/wayback_archiver.py b/archivers/wayback_archiver.py index 1fa98aa..b1e6824 100644 --- a/archivers/wayback_archiver.py +++ b/archivers/wayback_archiver.py @@ -1,10 +1,17 @@ import time, requests, os from bs4 import BeautifulSoup +# from dataclasses import dataclass from storages import Storage from .base_archiver import Archiver, ArchiveResult +# @dataclass +# class WaybackConfig: +# key: str +# secret: str +from configs import WaybackConfig +# TODO: use WaybackConfig class WaybackArchiver(Archiver): name = "wayback" diff --git a/configs/wayback_config.py b/configs/wayback_config.py new file mode 100644 index 0000000..7f455ee --- /dev/null +++ b/configs/wayback_config.py @@ -0,0 +1,7 @@ + +from dataclasses import dataclass + +@dataclass +class WaybackConfig: + key: str + secret: str \ No newline at end of file From f00e31c23d15a7b6a8663c2e98e6ebc427209e19 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Tue, 3 May 2022 20:33:54 +0200 Subject: [PATCH 06/84] introduce config.py --- archivers/__init__.py | 14 ++--- configs/__init__.py | 3 ++ configs/config.py | 123 ++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 133 insertions(+), 7 deletions(-) create mode 100644 configs/__init__.py create mode 100644 configs/config.py diff --git a/archivers/__init__.py b/archivers/__init__.py index c272301..26979c0 100644 --- a/archivers/__init__.py +++ b/archivers/__init__.py @@ -1,8 +1,8 @@ # we need to explicitly expose the available imports here -from .base_archiver import * -from .telegram_archiver import * -from .telethon_archiver import * -from .tiktok_archiver import * -from .wayback_archiver import * -from .youtubedl_archiver import * -from .twitter_archiver import * \ No newline at end of 
file +from .base_archiver import Archiver, ArchiveResult +from .telegram_archiver import TelegramArchiver +from .telethon_archiver import TelethonArchiver, TelegramConfig +from .tiktok_archiver import TiktokArchiver +from .wayback_archiver import WaybackArchiver, WaybackConfig +from .youtubedl_archiver import YoutubeDLArchiver +from .twitter_archiver import TwitterArchiver \ No newline at end of file diff --git a/configs/__init__.py b/configs/__init__.py new file mode 100644 index 0000000..d7d3283 --- /dev/null +++ b/configs/__init__.py @@ -0,0 +1,3 @@ +from .config import Config +from .wayback_config import WaybackConfig +from .telegram_config import TelegramConfig \ No newline at end of file diff --git a/configs/config.py b/configs/config.py new file mode 100644 index 0000000..b697e13 --- /dev/null +++ b/configs/config.py @@ -0,0 +1,123 @@ + +import argparse, json +import gspread +from loguru import logger +from selenium import webdriver + +from utils.gworksheet import GWorksheet +from storages import S3Config +from .wayback_config import WaybackConfig +from .telegram_config import TelegramConfig + +class Config: + """ + Controls the current execution parameters and manages API configurations + """ + + def __init__(self): + self.parser = self.get_argument_parser() + + def parse(self): + self.args = self.parser.parse_args() + logger.success(f'Command line arguments parsed successfully') + self.config_file = self.args.config + self.read_config_json() + logger.info(f'APIs and Services initialized:\n{self}') + + def read_config_json(self): + with open(self.config_file, "r", encoding="utf-8") as inf: + self.config = json.load(inf) + + execution = self.config.get("execution", {}) + + # general sheet configurations + self.sheet = getattr(self.args, "sheet") or execution.get("sheet") + assert self.sheet is not None, "'sheet' must be provided either through command line or configuration file" + + self.header = int(getattr(self.args, "header") or execution.get("header", 
1)) + self.tmp_folder = execution.get("tmp_folder", "tmp/") + + self.storage = execution.get("storage", "s3") + + # Column names come from config and can be overwritten by CMD + # in the end all are considered as lower case + config_column_names = execution.get("column_names", {}) + self.column_names = {} + for k in GWorksheet.COLUMN_NAMES.keys(): + self.column_names[k] = getattr(self.args, k) or config_column_names.get(k) or GWorksheet.COLUMN_NAMES[k] + self.column_names = {k: v.lower() for k, v in self.column_names.items()} + + # selenium driver + selenium_configs = execution.get("selenium", {}) + self.selenium_timeout = int(selenium_configs.get("timeout_seconds", 10)) + options = webdriver.FirefoxOptions() + options.headless = True + options.set_preference('network.protocol-handler.external.tg', False) + self.webdriver = webdriver.Firefox(options=options) + self.webdriver.set_window_size(1400, 2000) + self.webdriver.set_page_load_timeout(self.selenium_timeout) + + # APIs and service configurations + if "s3" in self.config: + s3 = self.config["s3"] + self.s3_config = S3Config( + bucket=s3["bucket"], + region=s3["region"], + key=s3["key"], + secret=s3["secret"] + ) + self.s3_config.private = getattr(self.args, "private") or s3["private"] or self.s3_config.private + self.s3_config.endpoint_url = s3["endpoint_url"] or self.s3_config.endpoint_url + self.s3_config.cdn_url = s3["cdn_url"] or self.s3_config.cdn_url + else: + logger.debug(f"'s3' key not present in the {self.config_file=}") + + if "wayback" in self.config: + self.wayback_config = WaybackConfig( + key=self.config["wayback"]["key"], + secret=self.config["wayback"]["secret"], + ) + else: + logger.debug(f"'wayback' key not present in the {self.config_file=}") + + if "telegram" in self.config: + self.telegram_config = TelegramConfig( + api_id=self.config["telegram"]["api_id"], + api_hash=self.config["telegram"]["api_hash"] + ) + else: + logger.debug(f"'telegram' key not present in the {self.config_file=}") + + 
self.gsheets_client = gspread.service_account( + filename=self.config.get("google_api", {}).get("filename", 'service_account.json') + ) + + + def get_argument_parser(self): + parser = argparse.ArgumentParser(description='Automatically archive social media videos from a Google Sheets document') + + parser.add_argument('--config', action='store', dest='config', help='the filename of the JSON configuration file (defaults to \'config.json\')', default='config.json') + parser.add_argument('--sheet', action='store', dest='sheet', help='the name of the google sheets document [execution.sheet in config.json]') + parser.add_argument('--header', action='store', dest='header', help='1-based index for the header row [execution.header in config.json]') + parser.add_argument('--private', action='store_true', help='Store content without public access permission [execution.header in config.json]') + + for k, v in GWorksheet.COLUMN_NAMES.items(): + parser.add_argument(f'--col-{k}', action='store', dest=k, help=f'the name of the column to fill with {k} (default={v})') + + return parser + + def __str__(self) -> str: + return json.dumps({ + "config_file": self.config_file, + "sheet": self.sheet, + "header": self.header, + "tmp_folder": self.tmp_folder, + "selenium_timeout_seconds": self.selenium_timeout, + "selenium_webdriver": self.webdriver != None, + "s3_config": self.s3_config != None, + "s3_private": getattr(self.s3_config, "private", None), + "wayback_config": self.wayback_config != None, + "telegram_config": self.telegram_config != None, + "gsheets_client": self.gsheets_client != None, + "column_names": self.column_names, + }, ensure_ascii=False, indent=4) From f592c7fcfe245c2c6fd40a487c12ab7ae63f627e Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Tue, 3 May 2022 20:34:04 +0200 Subject: [PATCH 07/84] refactor to use config.py --- auto_archive.py | 101 ++++++++++++++++++++++-------------------------- 1 file changed, 47 insertions(+), 
54 deletions(-) diff --git a/auto_archive.py b/auto_archive.py index d3db9a2..6fc41f1 100644 --- a/auto_archive.py +++ b/auto_archive.py @@ -1,22 +1,24 @@ -import os +# import os import datetime -import argparse +# import argparse import requests import shutil -import gspread +# import gspread from loguru import logger from dotenv import load_dotenv -from selenium import webdriver +# from selenium import webdriver import traceback -import archivers -from storages import S3Storage, S3Config +# import archivers +from archivers import TelethonArchiver, TelegramArchiver, TiktokArchiver, YoutubeDLArchiver, TwitterArchiver, WaybackArchiver, ArchiveResult +from storages import S3Storage from utils import GWorksheet, mkdir_if_not_exists +from configs import Config load_dotenv() -def update_sheet(gw, row, result: archivers.ArchiveResult): +def update_sheet(gw, row, result: ArchiveResult): cell_updates = [] row_values = gw.get_row(row) @@ -61,56 +63,56 @@ def expand_url(url): return url -def process_sheet(sheet, header=1, columns=GWorksheet.COLUMN_NAMES): - gc = gspread.service_account(filename='service_account.json') - sh = gc.open(sheet) +def process_sheet(c: Config, sheet, header=1, columns=GWorksheet.COLUMN_NAMES): + # gc = gspread.service_account(filename='service_account.json') + sh = c.gsheets_client.open(sheet) - s3_config = S3Config( - bucket=os.getenv('DO_BUCKET'), - region=os.getenv('DO_SPACES_REGION'), - key=os.getenv('DO_SPACES_KEY'), - secret=os.getenv('DO_SPACES_SECRET') - ) - telegram_config = archivers.TelegramConfig( - api_id=os.getenv('TELEGRAM_API_ID'), - api_hash=os.getenv('TELEGRAM_API_HASH') - ) + # s3_config = S3Config( + # bucket=os.getenv('DO_BUCKET'), + # region=os.getenv('DO_SPACES_REGION'), + # key=os.getenv('DO_SPACES_KEY'), + # secret=os.getenv('DO_SPACES_SECRET') + # ) + # telegram_config = archivers.TelegramConfig( + # api_id=os.getenv('TELEGRAM_API_ID'), + # api_hash=os.getenv('TELEGRAM_API_HASH') + # ) - options = webdriver.FirefoxOptions() 
- options.headless = True - options.set_preference('network.protocol-handler.external.tg', False) + # options = webdriver.FirefoxOptions() + # options.headless = True + # options.set_preference('network.protocol-handler.external.tg', False) - driver = webdriver.Firefox(options=options) - driver.set_window_size(1400, 2000) - driver.set_page_load_timeout(10) + # driver = webdriver.Firefox(options=options) + # driver.set_window_size(1400, 2000) + # driver.set_page_load_timeout(10) # loop through worksheets to check for ii, wks in enumerate(sh.worksheets()): - logger.info(f'Opening worksheet {ii}: "{wks.title}" header={header}') - gw = GWorksheet(wks, header_row=header, columns=columns) + logger.info(f'Opening worksheet {ii}: "{wks.title}" header={c.header}') + gw = GWorksheet(wks, header_row=c.header, columns=c.column_names) if not gw.col_exists('url'): logger.warning( - f'No "{columns["url"]}" column found, skipping worksheet {wks.title}') + f'No "{c.column_names["url"]}" column found, skipping worksheet {wks.title}') continue if not gw.col_exists('status'): logger.warning( - f'No "{columns["status"]}" column found, skipping worksheet {wks.title}') + f'No "{c.column_names["status"]}" column found, skipping worksheet {wks.title}') continue # archives will be in a folder 'doc_name/worksheet_name' - s3_config.folder = f'{sheet.replace(" ", "_")}/{wks.title.replace(" ", "_")}/' - s3_client = S3Storage(s3_config) + c.s3_config.folder = f'{sheet.replace(" ", "_")}/{wks.title.replace(" ", "_")}/' + s3_client = S3Storage(c.s3_config) # order matters, first to succeed excludes remaining active_archivers = [ - archivers.TelethonArchiver(s3_client, driver, telegram_config), - archivers.TelegramArchiver(s3_client, driver), - archivers.TiktokArchiver(s3_client, driver), - archivers.YoutubeDLArchiver(s3_client, driver), - archivers.TwitterArchiver(s3_client, driver), - archivers.WaybackArchiver(s3_client, driver) + TelethonArchiver(s3_client, c.webdriver, c.telegram_config), + 
TelegramArchiver(s3_client, c.webdriver), + TiktokArchiver(s3_client, c.webdriver), + YoutubeDLArchiver(s3_client, c.webdriver), + TwitterArchiver(s3_client, c.webdriver), + WaybackArchiver(s3_client, c.webdriver) ] # loop through rows in worksheet @@ -149,27 +151,18 @@ def process_sheet(sheet, header=1, columns=GWorksheet.COLUMN_NAMES): else: gw.set_cell(row, 'status', 'failed: no archiver') logger.success(f'Finshed worksheet {wks.title}') - driver.quit() def main(): - parser = argparse.ArgumentParser( - description='Automatically archive social media videos from a Google Sheets document') - parser.add_argument('--sheet', action='store', dest='sheet', help='the name of the google sheets document', required=True) - parser.add_argument('--header', action='store', dest='header', default=1, type=int, help='1-based index for the header row') - parser.add_argument('--private', action='store_true', help='Store content without public access permission') + c = Config() + c.parse() - for k, v in GWorksheet.COLUMN_NAMES.items(): - parser.add_argument(f'--col-{k}', action='store', dest=k, default=v, help=f'the name of the column to fill with {k} (defaults={v})') + logger.info(f'Opening document {c.sheet} for header {c.header}') - args = parser.parse_args() - config_columns = {k: getattr(args, k).lower() for k in GWorksheet.COLUMN_NAMES.keys()} - - logger.info(f'Opening document {args.sheet} for header {args.header}') - - mkdir_if_not_exists('tmp') - process_sheet(args.sheet, header=args.header, columns=config_columns) - shutil.rmtree('tmp') + mkdir_if_not_exists(c.tmp_folder) + process_sheet(c, c.sheet, header=c.header, columns=c.column_names) + shutil.rmtree(c.tmp_folder) + c.webdriver.quit() if __name__ == '__main__': From 0d65798308c608adde3355416c24623a5fab18cb Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Mon, 9 May 2022 14:54:48 +0200 Subject: [PATCH 08/84] wip: configurations and logic --- Pipfile.lock | 278 
+++++++++++++++++--------------- archivers/base_archiver.py | 7 +- archivers/telegram_archiver.py | 2 +- archivers/telethon_archiver.py | 9 +- archivers/tiktok_archiver.py | 2 +- archivers/youtubedl_archiver.py | 2 +- auto_archive.py | 36 +---- configs/config.py | 50 ++++-- storages/base_storage.py | 1 + storages/s3_storage.py | 41 ++++- 10 files changed, 239 insertions(+), 189 deletions(-) diff --git a/Pipfile.lock b/Pipfile.lock index c48a8ca..be3a10c 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -42,27 +42,27 @@ }, "beautifulsoup4": { "hashes": [ - "sha256:9a315ce70049920ea4572a4055bc4bd700c940521d36fc858205ad4fcde149bf", - "sha256:c23ad23c521d818955a4151a67d81580319d4bf548d3d49f4223ae041ff98891" + "sha256:58d5c3d29f5a36ffeb94f02f0d786cd53014cf9b3b3951d42e0080d8a9498d30", + "sha256:ad9aa55b65ef2808eb405f46cf74df7fcb7044d5cbc26487f96eb2ef2e436693" ], "index": "pypi", - "version": "==4.10.0" + "version": "==4.11.1" }, "boto3": { "hashes": [ - "sha256:76d5b90400c54b25278150768e946edf166acce2c1597c0ecfbebb1dbe9acf2c", - "sha256:7bb2e6506a6ad44d111dd20a5d510374b6958fe989b4ef887109c79d812f926f" + "sha256:441b619067cb205bfcd0e66fe085c16989ab65bd348823013e11bef991c00a5c", + "sha256:65e45029d234ff94ba8aa3bacb9df00fbbb2f1d9ee7fd1c2e40f4815d12ec3f5" ], "index": "pypi", - "version": "==1.21.19" + "version": "==1.22.9" }, "botocore": { "hashes": [ - "sha256:5ed2be0e413961134f4c17eab16396d41a5b4b73a637588260c04d20806d52ea", - "sha256:d0d77bce152ca51f3c2cd0f9bf05cb3b623e719406ad58b4c20444e237fe82eb" + "sha256:71962de55b053a0124a0514155f4cdcf0bce81795ffc2bd6e000c1594e99125a", + "sha256:a1d26b95aaa5b2e126df74b223d774fae7e6597bb55c363782178f5b87f0cad3" ], "markers": "python_version >= '3.6'", - "version": "==1.24.19" + "version": "==1.25.9" }, "brotli": { "hashes": [ @@ -219,11 +219,11 @@ }, "click": { "hashes": [ - "sha256:6a7a62563bbfabfda3a38f3023a1db4a35978c0abd76f6c9605ecd6554d6d9b1", - "sha256:8458d7b1287c5fb128c90e23381cf99dcde74beaf6c7ff6384ce84d6fe090adb" + 
"sha256:7682dc8afb30297001674575ea00d1814d808d6a36af415a82bd481d37ba7b8e", + "sha256:bb4d8133cb15a609f44e8213d9b391b0809795062913b383c62be0ee95b1db48" ], - "markers": "python_version >= '3.6'", - "version": "==8.0.4" + "markers": "python_version >= '3.7'", + "version": "==8.1.3" }, "cloudscraper": { "hashes": [ @@ -234,36 +234,38 @@ }, "cryptography": { "hashes": [ - "sha256:0a817b961b46894c5ca8a66b599c745b9a3d9f822725221f0e0fe49dc043a3a3", - "sha256:2d87cdcb378d3cfed944dac30596da1968f88fb96d7fc34fdae30a99054b2e31", - "sha256:30ee1eb3ebe1644d1c3f183d115a8c04e4e603ed6ce8e394ed39eea4a98469ac", - "sha256:391432971a66cfaf94b21c24ab465a4cc3e8bf4a939c1ca5c3e3a6e0abebdbcf", - "sha256:39bdf8e70eee6b1c7b289ec6e5d84d49a6bfa11f8b8646b5b3dfe41219153316", - "sha256:4caa4b893d8fad33cf1964d3e51842cd78ba87401ab1d2e44556826df849a8ca", - "sha256:53e5c1dc3d7a953de055d77bef2ff607ceef7a2aac0353b5d630ab67f7423638", - "sha256:596f3cd67e1b950bc372c33f1a28a0692080625592ea6392987dba7f09f17a94", - "sha256:5d59a9d55027a8b88fd9fd2826c4392bd487d74bf628bb9d39beecc62a644c12", - "sha256:6c0c021f35b421ebf5976abf2daacc47e235f8b6082d3396a2fe3ccd537ab173", - "sha256:73bc2d3f2444bcfeac67dd130ff2ea598ea5f20b40e36d19821b4df8c9c5037b", - "sha256:74d6c7e80609c0f4c2434b97b80c7f8fdfaa072ca4baab7e239a15d6d70ed73a", - "sha256:7be0eec337359c155df191d6ae00a5e8bbb63933883f4f5dffc439dac5348c3f", - "sha256:94ae132f0e40fe48f310bba63f477f14a43116f05ddb69d6fa31e93f05848ae2", - "sha256:bb5829d027ff82aa872d76158919045a7c1e91fbf241aec32cb07956e9ebd3c9", - "sha256:ca238ceb7ba0bdf6ce88c1b74a87bffcee5afbfa1e41e173b1ceb095b39add46", - "sha256:ca28641954f767f9822c24e927ad894d45d5a1e501767599647259cbf030b903", - "sha256:e0344c14c9cb89e76eb6a060e67980c9e35b3f36691e15e1b7a9e58a0a6c6dc3", - "sha256:ebc15b1c22e55c4d5566e3ca4db8689470a0ca2babef8e3a9ee057a8b82ce4b1", - "sha256:ec63da4e7e4a5f924b90af42eddf20b698a70e58d86a72d943857c4c6045b3ee" + "sha256:093cb351031656d3ee2f4fa1be579a8c69c754cf874206be1d4cf3b542042804", + 
"sha256:0cc20f655157d4cfc7bada909dc5cc228211b075ba8407c46467f63597c78178", + "sha256:1b9362d34363f2c71b7853f6251219298124aa4cc2075ae2932e64c91a3e2717", + "sha256:1f3bfbd611db5cb58ca82f3deb35e83af34bb8cf06043fa61500157d50a70982", + "sha256:2bd1096476aaac820426239ab534b636c77d71af66c547b9ddcd76eb9c79e004", + "sha256:31fe38d14d2e5f787e0aecef831457da6cec68e0bb09a35835b0b44ae8b988fe", + "sha256:3b8398b3d0efc420e777c40c16764d6870bcef2eb383df9c6dbb9ffe12c64452", + "sha256:3c81599befb4d4f3d7648ed3217e00d21a9341a9a688ecdd615ff72ffbed7336", + "sha256:419c57d7b63f5ec38b1199a9521d77d7d1754eb97827bbb773162073ccd8c8d4", + "sha256:46f4c544f6557a2fefa7ac8ac7d1b17bf9b647bd20b16decc8fbcab7117fbc15", + "sha256:471e0d70201c069f74c837983189949aa0d24bb2d751b57e26e3761f2f782b8d", + "sha256:59b281eab51e1b6b6afa525af2bd93c16d49358404f814fe2c2410058623928c", + "sha256:731c8abd27693323b348518ed0e0705713a36d79fdbd969ad968fbef0979a7e0", + "sha256:95e590dd70642eb2079d280420a888190aa040ad20f19ec8c6e097e38aa29e06", + "sha256:a68254dd88021f24a68b613d8c51d5c5e74d735878b9e32cc0adf19d1f10aaf9", + "sha256:a7d5137e556cc0ea418dca6186deabe9129cee318618eb1ffecbd35bee55ddc1", + "sha256:aeaba7b5e756ea52c8861c133c596afe93dd716cbcacae23b80bc238202dc023", + "sha256:dc26bb134452081859aa21d4990474ddb7e863aa39e60d1592800a8865a702de", + "sha256:e53258e69874a306fcecb88b7534d61820db8a98655662a3dd2ec7f1afd9132f", + "sha256:ef15c2df7656763b4ff20a9bc4381d8352e6640cfeb95c2972c38ef508e75181", + "sha256:f224ad253cc9cea7568f49077007d2263efa57396a2f2f78114066fd54b5c68e", + "sha256:f8ec91983e638a9bcd75b39f1396e5c0dc2330cbd9ce4accefe68717e6779e0a" ], - "version": "==36.0.1" + "version": "==37.0.2" }, "faker": { "hashes": [ - "sha256:66db859b6abe376d02e805ad81eb8dcfce38f0945f17ee7cdf74ed349985ea52", - "sha256:fe969607836ce7100e38b88dcb598aacb733d895e6e9401894dd603e35623000" + "sha256:0301ace8365d98f3d0bf6e9a40200c8548e845d3812402ae1daf589effe3fb01", + "sha256:b1903db92175d78051858128ada397c7dc76f376f6967975419da232b3ebd429" ], 
"markers": "python_version >= '3.6'", - "version": "==13.3.2" + "version": "==13.7.0" }, "ffmpeg-python": { "hashes": [ @@ -283,11 +285,11 @@ }, "flask": { "hashes": [ - "sha256:59da8a3170004800a2837844bfa84d49b022550616070f7cb1a659682b2e7c9f", - "sha256:e1120c228ca2f553b470df4a5fa927ab66258467526069981b3eb0a91902687d" + "sha256:315ded2ddf8a6281567edb27393010fe3406188bafbfe65a3339d5787d89e477", + "sha256:fad5b446feb0d6db6aec0c3184d16a8c1f6c3e464b511649c8918a9be100b4fe" ], - "markers": "python_version >= '3.6'", - "version": "==2.0.3" + "markers": "python_version >= '3.7'", + "version": "==2.1.2" }, "future": { "hashes": [ @@ -298,11 +300,11 @@ }, "google-auth": { "hashes": [ - "sha256:218ca03d7744ca0c8b6697b6083334be7df49b7bf76a69d555962fd1a7657b5f", - "sha256:ad160fc1ea8f19e331a16a14a79f3d643d813a69534ba9611d2c80dc10439dad" + "sha256:1ba4938e032b73deb51e59c4656a00e0939cf0b1112575099f136babb4563312", + "sha256:349ac49b18b01019453cc99c11c92ed772739778c92f184002b7ab3a5b7ac77d" ], "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5'", - "version": "==2.6.0" + "version": "==2.6.6" }, "google-auth-oauthlib": { "hashes": [ @@ -314,11 +316,11 @@ }, "gspread": { "hashes": [ - "sha256:05297b49587b5e89c2a0aa39967f43e5b7f170b62c11ddd43214baa1085131a8", - "sha256:25173ac081469cf9d621514c6576c6cf46f39c825f178b8cb9e78374a637b0bf" + "sha256:319766d90db05056293f7ee0ad2b35503a1a40683a75897a2922398cd2016283", + "sha256:c719e1c024a2a6f3b7d818fbe07c3886b26fd6504b64d1b1359cf242968213cd" ], "index": "pypi", - "version": "==5.2.0" + "version": "==5.3.2" }, "h11": { "hashes": [ @@ -336,29 +338,37 @@ "markers": "python_version >= '3'", "version": "==3.3" }, + "importlib-metadata": { + "hashes": [ + "sha256:1208431ca90a8cca1a6b8af391bb53c1a2db74e5d1cef6ddced95d4b2062edc6", + "sha256:ea4c597ebf37142f827b8f39299579e31685c31d3a438b59f469406afd0f2539" + ], + "markers": "python_version < '3.10'", + "version": "==4.11.3" + }, "itsdangerous": { "hashes": [ - 
"sha256:7b7d3023cd35d9cb0c1fd91392f8c95c6fa02c59bf8ad64b8849be3401b95afb", - "sha256:935642cd4b987cdbee7210080004033af76306757ff8b4c0a506a4b6e06f02cf" + "sha256:2c2349112351b88699d8d4b6b075022c0808887cb7ad10069318a8b0bc88db44", + "sha256:5dbbc68b317e5e42f327f9021763545dc3fc3bfe22e6deb96aaf1fc38874156a" ], "markers": "python_version >= '3.7'", - "version": "==2.1.1" + "version": "==2.1.2" }, "jinja2": { "hashes": [ - "sha256:077ce6014f7b40d03b47d1f1ca4b0fc8328a692bd284016f806ed0eaca390ad8", - "sha256:611bb273cd68f3b993fabdc4064fc858c5b47a973cb5aa7999ec1ba405c87cd7" + "sha256:31351a702a408a9e7595a8fc6150fc3f43bb6bf7e319770cbc0db9df9437e852", + "sha256:6088930bfe239f0e6710546ab9c19c9ef35e29792895fed6e6e31a023a182a61" ], - "markers": "python_version >= '3.6'", - "version": "==3.0.3" + "markers": "python_version >= '3.7'", + "version": "==3.1.2" }, "jmespath": { "hashes": [ - "sha256:b85d0567b8666149a93172712e68920734333c0ce7e89b78b3e987f71e5ed4f9", - "sha256:cdf6525904cc597730141d61b36f2e4b8ecc257c420fa2f4549bac2c2d0cb72f" + "sha256:a490e280edd1f57d6de88636992d05b71e97d69a26a19f058ecf7d304474bf5e", + "sha256:e8dcd576ed616f14ec02eed0005c85973b5890083313860136657e24784e4c04" ], - "markers": "python_version >= '2.6' and python_version not in '3.0, 3.1, 3.2, 3.3'", - "version": "==0.10.0" + "markers": "python_version >= '3.7'", + "version": "==1.0.0" }, "loguru": { "hashes": [ @@ -596,11 +606,11 @@ }, "pyparsing": { "hashes": [ - "sha256:18ee9022775d270c55187733956460083db60b37d0d0fb357445f3094eed3eea", - "sha256:a6c06a88f252e6c322f65faf8f418b16213b51bdfaece0524c1c1bc30c63c484" + "sha256:7bf433498c016c4314268d95df76c81b842a4cb2b276fa3312cfb1e1d85f6954", + "sha256:ef7b523f6356f763771559412c0d7134753f037822dad1b16945b7b846f7ad06" ], - "markers": "python_version >= '3.6'", - "version": "==3.0.7" + "markers": "python_full_version >= '3.6.8'", + "version": "==3.0.8" }, "pysocks": { "hashes": [ @@ -620,14 +630,13 @@ }, "python-dotenv": { "hashes": [ - 
"sha256:32b2bdc1873fd3a3c346da1c6db83d0053c3c62f28f1f38516070c4c8971b1d3", - "sha256:a5de49a31e953b45ff2d2fd434bbc2670e8db5273606c1e737cc6b93eff3655f" + "sha256:b7e3b04a59693c42c36f9ab1cc2acc46fa5df8c78e178fc33a8d4cd05c8d498f", + "sha256:d92a187be61fe482e4fd675b6d52200e7be63a12b724abbf931a40ce4fa92938" ], "index": "pypi", - "version": "==0.19.2" + "version": "==0.20.0" }, "requests": { - "extras": [], "hashes": [ "sha256:68d7c56fd5a8999887728ef304a6d12edc7be74f1cfa47714fc8b414525c9a61", "sha256:f22fa1e554c9ddfd16e6e41ac79759e17be9e492b3587efa038054674760e72d" @@ -668,10 +677,10 @@ }, "selenium": { "hashes": [ - "sha256:14d28a628c831c105d38305c881c9c7847199bfd728ec84240c5e86fa1c9bd5a" + "sha256:866b6dd6c459210662bff922ee7c33162d21920fbf6811e8e5a52be3866a687f" ], "index": "pypi", - "version": "==4.1.3" + "version": "==4.1.5" }, "six": { "hashes": [ @@ -706,11 +715,11 @@ }, "soupsieve": { "hashes": [ - "sha256:1a3cca2617c6b38c0343ed661b1fa5de5637f257d4fe22bd9f1338010a1efefb", - "sha256:b8d49b1cd4f037c7082a9683dfa1801aa2597fb11c3a1155b7a5b94829b4f1f9" + "sha256:3b2503d3c7084a42b1ebd08116e5f81aadfaea95863628c80a3b774a11b7c759", + "sha256:fc53893b3da2c33de295667a0e19f078c14bf86544af307354de5fcf12a3f30d" ], "markers": "python_version >= '3.6'", - "version": "==2.3.1" + "version": "==2.3.2.post1" }, "telethon": { "hashes": [ @@ -741,75 +750,74 @@ "version": "==0.9.2" }, "urllib3": { - "extras": [], "hashes": [ - "sha256:000ca7f471a233c2251c6c7023ee85305721bfdf18621ebff4fd17a8653427ed", - "sha256:0e7c33d9a63e7ddfcb86780aac87befc2fbddf46c58dbb487e0855f7ceec283c" + "sha256:44ece4d53fb1706f667c9bd1c648f5469a2ec925fcf3a776667042d645472c14", + "sha256:aabaf16477806a5e1dd19aa41f8c2b7950dd3c746362d7e3223dbe6de6ac448e" ], "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' and python_version < '4'", - "version": "==1.26.8" + "version": "==1.26.9" }, "websockets": { "hashes": [ - 
"sha256:038afef2a05893578d10dadbdbb5f112bd115c46347e1efe99f6a356ff062138", - "sha256:05f6e9757017270e7a92a2975e2ae88a9a582ffc4629086fd6039aa80e99cd86", - "sha256:0b66421f9f13d4df60cd48ab977ed2c2b6c9147ae1a33caf5a9f46294422fda1", - "sha256:0cd02f36d37e503aca88ab23cc0a1a0e92a263d37acf6331521eb38040dcf77b", - "sha256:0f73cb2526d6da268e86977b2c4b58f2195994e53070fe567d5487c6436047e6", - "sha256:117383d0a17a0dda349f7a8790763dde75c1508ff8e4d6e8328b898b7df48397", - "sha256:1c1f3b18c8162e3b09761d0c6a0305fd642934202541cc511ef972cb9463261e", - "sha256:1c9031e90ebfc486e9cdad532b94004ade3aa39a31d3c46c105bb0b579cd2490", - "sha256:2349fa81b6b959484bb2bda556ccb9eb70ba68987646a0f8a537a1a18319fb03", - "sha256:24b879ba7db12bb525d4e58089fcbe6a3df3ce4666523183654170e86d372cbe", - "sha256:2aa9b91347ecd0412683f28aabe27f6bad502d89bd363b76e0a3508b1596402e", - "sha256:56d48eebe9e39ce0d68701bce3b21df923aa05dcc00f9fd8300de1df31a7c07c", - "sha256:5a38a0175ae82e4a8c4bac29fc01b9ee26d7d5a614e5ee11e7813c68a7d938ce", - "sha256:5b04270b5613f245ec84bb2c6a482a9d009aefad37c0575f6cda8499125d5d5c", - "sha256:6193bbc1ee63aadeb9a4d81de0e19477401d150d506aee772d8380943f118186", - "sha256:669e54228a4d9457abafed27cbf0e2b9f401445c4dfefc12bf8e4db9751703b8", - "sha256:6a009eb551c46fd79737791c0c833fc0e5b56bcd1c3057498b262d660b92e9cd", - "sha256:71a4491cfe7a9f18ee57d41163cb6a8a3fa591e0f0564ca8b0ed86b2a30cced4", - "sha256:7b38a5c9112e3dbbe45540f7b60c5204f49b3cb501b40950d6ab34cd202ab1d0", - "sha256:7bb9d8a6beca478c7e9bdde0159bd810cc1006ad6a7cb460533bae39da692ca2", - "sha256:82bc33db6d8309dc27a3bee11f7da2288ad925fcbabc2a4bb78f7e9c56249baf", - "sha256:8351c3c86b08156337b0e4ece0e3c5ec3e01fcd14e8950996832a23c99416098", - "sha256:8beac786a388bb99a66c3be4ab0fb38273c0e3bc17f612a4e0a47c4fc8b9c045", - "sha256:97950c7c844ec6f8d292440953ae18b99e3a6a09885e09d20d5e7ecd9b914cf8", - "sha256:98f57b3120f8331cd7440dbe0e776474f5e3632fdaa474af1f6b754955a47d71", - "sha256:9ca2ca05a4c29179f06cf6727b45dba5d228da62623ec9df4184413d8aae6cb9", 
- "sha256:a03a25d95cc7400bd4d61a63460b5d85a7761c12075ee2f51de1ffe73aa593d3", - "sha256:a10c0c1ee02164246f90053273a42d72a3b2452a7e7486fdae781138cf7fbe2d", - "sha256:a72b92f96e5e540d5dda99ee3346e199ade8df63152fa3c737260da1730c411f", - "sha256:ac081aa0307f263d63c5ff0727935c736c8dad51ddf2dc9f5d0c4759842aefaa", - "sha256:b22bdc795e62e71118b63e14a08bacfa4f262fd2877de7e5b950f5ac16b0348f", - "sha256:b4059e2ccbe6587b6dc9a01db5fc49ead9a884faa4076eea96c5ec62cb32f42a", - "sha256:b7fe45ae43ac814beb8ca09d6995b56800676f2cfa8e23f42839dc69bba34a42", - "sha256:bef03a51f9657fb03d8da6ccd233fe96e04101a852f0ffd35f5b725b28221ff3", - "sha256:bffc65442dd35c473ca9790a3fa3ba06396102a950794f536783f4b8060af8dd", - "sha256:c21a67ab9a94bd53e10bba21912556027fea944648a09e6508415ad14e37c325", - "sha256:c67d9cacb3f6537ca21e9b224d4fd08481538e43bcac08b3d93181b0816def39", - "sha256:c6e56606842bb24e16e36ae7eb308d866b4249cf0be8f63b212f287eeb76b124", - "sha256:cb316b87cbe3c0791c2ad92a5a36bf6adc87c457654335810b25048c1daa6fd5", - "sha256:cef40a1b183dcf39d23b392e9dd1d9b07ab9c46aadf294fff1350fb79146e72b", - "sha256:cf931c33db9c87c53d009856045dd524e4a378445693382a920fa1e0eb77c36c", - "sha256:d4d110a84b63c5cfdd22485acc97b8b919aefeecd6300c0c9d551e055b9a88ea", - "sha256:d5396710f86a306cf52f87fd8ea594a0e894ba0cc5a36059eaca3a477dc332aa", - "sha256:f09f46b1ff6d09b01c7816c50bd1903cf7d02ebbdb63726132717c2fcda835d5", - "sha256:f14bd10e170abc01682a9f8b28b16e6f20acf6175945ef38db6ffe31b0c72c3f", - "sha256:f5c335dc0e7dc271ef36df3f439868b3c790775f345338c2f61a562f1074187b", - "sha256:f8296b8408ec6853b26771599990721a26403e62b9de7e50ac0a056772ac0b5e", - "sha256:fa35c5d1830d0fb7b810324e9eeab9aa92e8f273f11fdbdc0741dcded6d72b9f" + "sha256:07cdc0a5b2549bcfbadb585ad8471ebdc7bdf91e32e34ae3889001c1c106a6af", + "sha256:210aad7fdd381c52e58777560860c7e6110b6174488ef1d4b681c08b68bf7f8c", + "sha256:28dd20b938a57c3124028680dc1600c197294da5db4292c76a0b48efb3ed7f76", + 
"sha256:2f94fa3ae454a63ea3a19f73b95deeebc9f02ba2d5617ca16f0bbdae375cda47", + "sha256:31564a67c3e4005f27815634343df688b25705cccb22bc1db621c781ddc64c69", + "sha256:347974105bbd4ea068106ec65e8e8ebd86f28c19e529d115d89bd8cc5cda3079", + "sha256:379e03422178436af4f3abe0aa8f401aa77ae2487843738542a75faf44a31f0c", + "sha256:3eda1cb7e9da1b22588cefff09f0951771d6ee9fa8dbe66f5ae04cc5f26b2b55", + "sha256:51695d3b199cd03098ae5b42833006a0f43dc5418d3102972addc593a783bc02", + "sha256:54c000abeaff6d8771a4e2cef40900919908ea7b6b6a30eae72752607c6db559", + "sha256:5b936bf552e4f6357f5727579072ff1e1324717902127ffe60c92d29b67b7be3", + "sha256:6075fd24df23133c1b078e08a9b04a3bc40b31a8def4ee0b9f2c8865acce913e", + "sha256:661f641b44ed315556a2fa630239adfd77bd1b11cb0b9d96ed8ad90b0b1e4978", + "sha256:6ea6b300a6bdd782e49922d690e11c3669828fe36fc2471408c58b93b5535a98", + "sha256:6ed1d6f791eabfd9808afea1e068f5e59418e55721db8b7f3bfc39dc831c42ae", + "sha256:7934e055fd5cd9dee60f11d16c8d79c4567315824bacb1246d0208a47eca9755", + "sha256:7ab36e17af592eec5747c68ef2722a74c1a4a70f3772bc661079baf4ae30e40d", + "sha256:7f6d96fdb0975044fdd7953b35d003b03f9e2bcf85f2d2cf86285ece53e9f991", + "sha256:83e5ca0d5b743cde3d29fda74ccab37bdd0911f25bd4cdf09ff8b51b7b4f2fa1", + "sha256:85506b3328a9e083cc0a0fb3ba27e33c8db78341b3eb12eb72e8afd166c36680", + "sha256:8af75085b4bc0b5c40c4a3c0e113fa95e84c60f4ed6786cbb675aeb1ee128247", + "sha256:8b1359aba0ff810d5830d5ab8e2c4a02bebf98a60aa0124fb29aa78cfdb8031f", + "sha256:8fbd7d77f8aba46d43245e86dd91a8970eac4fb74c473f8e30e9c07581f852b2", + "sha256:907e8247480f287aa9bbc9391bd6de23c906d48af54c8c421df84655eef66af7", + "sha256:93d5ea0b5da8d66d868b32c614d2b52d14304444e39e13a59566d4acb8d6e2e4", + "sha256:97bc9d41e69a7521a358f9b8e44871f6cdeb42af31815c17aed36372d4eec667", + "sha256:994cdb1942a7a4c2e10098d9162948c9e7b235df755de91ca33f6e0481366fdb", + "sha256:a141de3d5a92188234afa61653ed0bbd2dde46ad47b15c3042ffb89548e77094", + "sha256:a1e15b230c3613e8ea82c9fc6941b2093e8eb939dd794c02754d33980ba81e36", 
+ "sha256:aad5e300ab32036eb3fdc350ad30877210e2f51bceaca83fb7fef4d2b6c72b79", + "sha256:b529fdfa881b69fe563dbd98acce84f3e5a67df13de415e143ef053ff006d500", + "sha256:b9c77f0d1436ea4b4dc089ed8335fa141e6a251a92f75f675056dac4ab47a71e", + "sha256:bb621ec2dbbbe8df78a27dbd9dd7919f9b7d32a73fafcb4d9252fc4637343582", + "sha256:c7250848ce69559756ad0086a37b82c986cd33c2d344ab87fea596c5ac6d9442", + "sha256:c8d1d14aa0f600b5be363077b621b1b4d1eb3fbf90af83f9281cda668e6ff7fd", + "sha256:d1655a6fc7aecd333b079d00fb3c8132d18988e47f19740c69303bf02e9883c6", + "sha256:d6353ba89cfc657a3f5beabb3b69be226adbb5c6c7a66398e17809b0ce3c4731", + "sha256:da4377904a3379f0c1b75a965fff23b28315bcd516d27f99a803720dfebd94d4", + "sha256:e49ea4c1a9543d2bd8a747ff24411509c29e4bdcde05b5b0895e2120cb1a761d", + "sha256:e4e08305bfd76ba8edab08dcc6496f40674f44eb9d5e23153efa0a35750337e8", + "sha256:e6fa05a680e35d0fcc1470cb070b10e6fe247af54768f488ed93542e71339d6f", + "sha256:e7e6f2d6fd48422071cc8a6f8542016f350b79cc782752de531577d35e9bd677", + "sha256:e904c0381c014b914136c492c8fa711ca4cced4e9b3d110e5e7d436d0fc289e8", + "sha256:ec2b0ab7edc8cd4b0eb428b38ed89079bdc20c6bdb5f889d353011038caac2f9", + "sha256:ef5ce841e102278c1c2e98f043db99d6755b1c58bde475516aef3a008ed7f28e", + "sha256:f351c7d7d92f67c0609329ab2735eee0426a03022771b00102816a72715bb00b", + "sha256:fab7c640815812ed5f10fbee7abbf58788d602046b7bb3af9b1ac753a6d5e916", + "sha256:fc06cc8073c8e87072138ba1e431300e2d408f054b27047d047b549455066ff4" ], "markers": "python_version >= '3.7'", - "version": "==10.2" + "version": "==10.3" }, "werkzeug": { "hashes": [ - "sha256:1421ebfc7648a39a5c58c601b154165d05cf47a3cd0ccb70857cbdacf6c8f2b8", - "sha256:b863f8ff057c522164b6067c9e28b041161b4be5ba4d0daceeaa50a163822d3c" + "sha256:1ce08e8093ed67d638d63879fd1ba3735817f7a80de3674d293f5984f25fb6e6", + "sha256:72a4b735692dd3135217911cbeaa1be5fa3f62bffb8745c5215420a03dc55255" ], - "markers": "python_version >= '3.6'", - "version": "==2.0.3" + "markers": "python_version >= '3.7'", + 
"version": "==2.1.2" }, "wsproto": { "hashes": [ @@ -821,11 +829,19 @@ }, "yt-dlp": { "hashes": [ - "sha256:05179f0f2c34f06910003bb9f80af68ff798b072ca0f826c0e6704a3fbd5b306", - "sha256:68546578c18e6ce87450b53769d5d5b7f5a23e5209784976db6c7ccbf7954b21" + "sha256:6edefe326b1e1478fdbe627a66203e5248a6b0dd50c101e682cf700ab70cdf72", + "sha256:8758d016509d4574b90fbde975aa70adaef71ed5e7a195141588f6d6945205ba" ], "index": "pypi", - "version": "==2022.3.8.2" + "version": "==2022.4.8" + }, + "zipp": { + "hashes": [ + "sha256:56bf8aadb83c24db6c4b577e13de374ccfb67da2078beba1d037c17980bf43ad", + "sha256:c4f6e5bbf48e74f7a38e7cc5b0480ff42b0ae5178957d564d18932525d5cf099" + ], + "markers": "python_version >= '3.7'", + "version": "==3.8.0" } }, "develop": {} diff --git a/archivers/base_archiver.py b/archivers/base_archiver.py index 7ab5a9c..aa85444 100644 --- a/archivers/base_archiver.py +++ b/archivers/base_archiver.py @@ -30,6 +30,7 @@ class ArchiveResult: class Archiver(ABC): name = "default" + TMP_FOLDER = "tmp/" def __init__(self, storage: Storage, driver): self.storage = storage @@ -60,7 +61,7 @@ class Archiver(ABC): page += f"" page_key = self.get_key(urlparse(url).path.replace("/", "_") + ".html") - page_filename = 'tmp/' + page_key + page_filename = Archiver.TMP_FOLDER + page_key page_cdn = self.storage.get_cdn_url(page_key) with open(page_filename, "w") as f: @@ -85,7 +86,7 @@ class Archiver(ABC): if '.' 
not in path: key += '.jpg' - filename = 'tmp/' + key + filename = Archiver.TMP_FOLDER + key d = requests.get(media_url, headers=headers) with open(filename, 'wb') as f: @@ -126,7 +127,7 @@ class Archiver(ABC): def get_screenshot(self, url): key = self.get_key(urlparse(url).path.replace( "/", "_") + datetime.datetime.utcnow().isoformat().replace(" ", "_") + ".png") - filename = 'tmp/' + key + filename = Archiver.TMP_FOLDER + key try: self.driver.get(url) diff --git a/archivers/telegram_archiver.py b/archivers/telegram_archiver.py index 5a7f63c..5c586e6 100644 --- a/archivers/telegram_archiver.py +++ b/archivers/telegram_archiver.py @@ -52,7 +52,7 @@ class TelegramArchiver(Archiver): video_id = video_url.split('/')[-1].split('?')[0] key = self.get_key(video_id) - filename = 'tmp/' + key + filename = Archiver.TMP_FOLDER + key cdn_url = self.storage.get_cdn_url(key) if check_if_exists and self.storage.exists(key): diff --git a/archivers/telethon_archiver.py b/archivers/telethon_archiver.py index 08b7dec..69bd49a 100644 --- a/archivers/telethon_archiver.py +++ b/archivers/telethon_archiver.py @@ -9,7 +9,6 @@ from telethon.sync import TelegramClient from configs import TelegramConfig - class TelethonArchiver(Archiver): name = "telethon" link_pattern = re.compile(r"https:\/\/t\.me(\/c){0,1}\/(.+)\/(.+)") @@ -71,8 +70,8 @@ class TelethonArchiver(Archiver): message = post.message for mp in media_posts: if len(mp.message) > len(message): message = mp.message - filename = self.client.download_media(mp.media, f'tmp/{chat}_{group_id}/{mp.id}') - key = filename.split('tmp/')[1] + filename = self.client.download_media(mp.media, f'{Archiver.TMP_FOLDER}{chat}_{group_id}/{mp.id}') + key = filename.split(Archiver.TMP_FOLDER)[1] self.storage.upload(filename, key) hash = self.get_hash(filename) cdn_url = self.storage.get_cdn_url(key) @@ -84,8 +83,8 @@ class TelethonArchiver(Archiver): return ArchiveResult(status=status, cdn_url=page_cdn, title=post.message, timestamp=post.date, 
hash=page_hash, screenshot=screenshot) elif len(media_posts) == 1: key = self.get_key(f'{chat}_{post_id}') - filename = self.client.download_media(post.media, f'tmp/{key}') - key = filename.split('tmp/')[1].replace(" ", "") + filename = self.client.download_media(post.media, f'{Archiver.TMP_FOLDER}{key}') + key = filename.split(Archiver.TMP_FOLDER)[1].replace(" ", "") self.storage.upload(filename, key) hash = self.get_hash(filename) cdn_url = self.storage.get_cdn_url(key) diff --git a/archivers/tiktok_archiver.py b/archivers/tiktok_archiver.py index 6b5116f..902e808 100644 --- a/archivers/tiktok_archiver.py +++ b/archivers/tiktok_archiver.py @@ -18,7 +18,7 @@ class TiktokArchiver(Archiver): info = tiktok_downloader.info_post(url) key = self.get_key(f'{info.id}.mp4') cdn_url = self.storage.get_cdn_url(key) - filename = 'tmp/' + key + filename = Archiver.TMP_FOLDER + key if check_if_exists and self.storage.exists(key): status = 'already archived' diff --git a/archivers/youtubedl_archiver.py b/archivers/youtubedl_archiver.py index 6e626b5..493b792 100644 --- a/archivers/youtubedl_archiver.py +++ b/archivers/youtubedl_archiver.py @@ -9,7 +9,7 @@ from .base_archiver import Archiver, ArchiveResult class YoutubeDLArchiver(Archiver): name = "youtube_dl" - ydl_opts = {'outtmpl': 'tmp/%(id)s.%(ext)s', 'quiet': False} + ydl_opts = {'outtmpl': f'{Archiver.TMP_FOLDER}%(id)s.%(ext)s', 'quiet': False} def download(self, url, check_if_exists=False): netloc = self.get_netloc(url) diff --git a/auto_archive.py b/auto_archive.py index 6fc41f1..af081ad 100644 --- a/auto_archive.py +++ b/auto_archive.py @@ -64,28 +64,8 @@ def expand_url(url): def process_sheet(c: Config, sheet, header=1, columns=GWorksheet.COLUMN_NAMES): - # gc = gspread.service_account(filename='service_account.json') sh = c.gsheets_client.open(sheet) - # s3_config = S3Config( - # bucket=os.getenv('DO_BUCKET'), - # region=os.getenv('DO_SPACES_REGION'), - # key=os.getenv('DO_SPACES_KEY'), - # 
secret=os.getenv('DO_SPACES_SECRET') - # ) - # telegram_config = archivers.TelegramConfig( - # api_id=os.getenv('TELEGRAM_API_ID'), - # api_hash=os.getenv('TELEGRAM_API_HASH') - # ) - - # options = webdriver.FirefoxOptions() - # options.headless = True - # options.set_preference('network.protocol-handler.external.tg', False) - - # driver = webdriver.Firefox(options=options) - # driver.set_window_size(1400, 2000) - # driver.set_page_load_timeout(10) - # loop through worksheets to check for ii, wks in enumerate(sh.worksheets()): logger.info(f'Opening worksheet {ii}: "{wks.title}" header={c.header}') @@ -102,17 +82,17 @@ def process_sheet(c: Config, sheet, header=1, columns=GWorksheet.COLUMN_NAMES): continue # archives will be in a folder 'doc_name/worksheet_name' - c.s3_config.folder = f'{sheet.replace(" ", "_")}/{wks.title.replace(" ", "_")}/' - s3_client = S3Storage(c.s3_config) + c.set_folder(f'{sheet.replace(" ", "_")}/{wks.title.replace(" ", "_")}/') + storage = c.get_storage() # order matters, first to succeed excludes remaining active_archivers = [ - TelethonArchiver(s3_client, c.webdriver, c.telegram_config), - TelegramArchiver(s3_client, c.webdriver), - TiktokArchiver(s3_client, c.webdriver), - YoutubeDLArchiver(s3_client, c.webdriver), - TwitterArchiver(s3_client, c.webdriver), - WaybackArchiver(s3_client, c.webdriver) + TelethonArchiver(storage, c.webdriver, c.telegram_config), + TelegramArchiver(storage, c.webdriver), + TiktokArchiver(storage, c.webdriver), + YoutubeDLArchiver(storage, c.webdriver), + TwitterArchiver(storage, c.webdriver), + WaybackArchiver(storage, c.webdriver) ] # loop through rows in worksheet diff --git a/configs/config.py b/configs/config.py index b697e13..6e68310 100644 --- a/configs/config.py +++ b/configs/config.py @@ -3,19 +3,29 @@ import argparse, json import gspread from loguru import logger from selenium import webdriver +from storages.local_storage import LocalStorage from utils.gworksheet import GWorksheet -from storages 
import S3Config +from storages import S3Config, S3Storage from .wayback_config import WaybackConfig from .telegram_config import TelegramConfig +from archivers import Archiver + class Config: """ Controls the current execution parameters and manages API configurations + Usage: + c = Config() # initializes the argument parser + c.parse() # parses the values and initializes the Services and API clients + # you can then access the Services and APIs like + c.s3_config + """ def __init__(self): self.parser = self.get_argument_parser() + self.folder = "" def parse(self): self.args = self.parser.parse_args() @@ -35,7 +45,8 @@ class Config: assert self.sheet is not None, "'sheet' must be provided either through command line or configuration file" self.header = int(getattr(self.args, "header") or execution.get("header", 1)) - self.tmp_folder = execution.get("tmp_folder", "tmp/") + self.tmp_folder = execution.get("tmp_folder", Archiver.TMP_FOLDER) + Archiver.TMP_FOLDER = self.tmp_folder self.storage = execution.get("storage", "s3") @@ -57,9 +68,10 @@ class Config: self.webdriver.set_window_size(1400, 2000) self.webdriver.set_page_load_timeout(self.selenium_timeout) + secrets = self.config.get("secrets", {}) # APIs and service configurations - if "s3" in self.config: - s3 = self.config["s3"] + if "s3" in secrets: + s3 = secrets["s3"] self.s3_config = S3Config( bucket=s3["bucket"], region=s3["region"], @@ -69,29 +81,32 @@ class Config: self.s3_config.private = getattr(self.args, "private") or s3["private"] or self.s3_config.private self.s3_config.endpoint_url = s3["endpoint_url"] or self.s3_config.endpoint_url self.s3_config.cdn_url = s3["cdn_url"] or self.s3_config.cdn_url + self.s3_config.key_path = s3["key_path"] or self.s3_config.key_path + self.s3_config.no_folder = s3["no_folder"] or self.s3_config.no_folder else: logger.debug(f"'s3' key not present in the {self.config_file=}") - if "wayback" in self.config: + if "wayback" in secrets: self.wayback_config = WaybackConfig( 
- key=self.config["wayback"]["key"], - secret=self.config["wayback"]["secret"], + key=secrets["wayback"]["key"], + secret=secrets["wayback"]["secret"], ) else: logger.debug(f"'wayback' key not present in the {self.config_file=}") - if "telegram" in self.config: + if "telegram" in secrets: self.telegram_config = TelegramConfig( - api_id=self.config["telegram"]["api_id"], - api_hash=self.config["telegram"]["api_hash"] + api_id=secrets["telegram"]["api_id"], + api_hash=secrets["telegram"]["api_hash"] ) else: logger.debug(f"'telegram' key not present in the {self.config_file=}") self.gsheets_client = gspread.service_account( - filename=self.config.get("google_api", {}).get("filename", 'service_account.json') + filename=secrets.get("google_api", {}).get("filename", 'service_account.json') ) + del self.config["secrets"] def get_argument_parser(self): parser = argparse.ArgumentParser(description='Automatically archive social media videos from a Google Sheets document') @@ -106,6 +121,19 @@ class Config: return parser + def set_folder(self, folder): + # update the folder in each of the storages + self.folder = folder + self.s3_config.folder = folder + + def get_storage(self): + if self.storage == "s3": + return S3Storage(self.s3_config) + elif self.storage == "local": + return LocalStorage(self.folder) + raise f"storage {self.storage} not yet implemented" + + def __str__(self) -> str: return json.dumps({ "config_file": self.config_file, diff --git a/storages/base_storage.py b/storages/base_storage.py index e1bf9c7..7494072 100644 --- a/storages/base_storage.py +++ b/storages/base_storage.py @@ -3,6 +3,7 @@ from abc import ABC, abstractmethod class Storage(ABC): + TMP_FOLDER = "tmp/" @abstractmethod def __init__(self, config): pass diff --git a/storages/s3_storage.py b/storages/s3_storage.py index 084153d..ca36597 100644 --- a/storages/s3_storage.py +++ b/storages/s3_storage.py @@ -1,7 +1,10 @@ +import uuid, os +from dataclasses import dataclass + import boto3 from 
botocore.errorfactory import ClientError + from .base_storage import Storage -from dataclasses import dataclass @dataclass @@ -14,6 +17,8 @@ class S3Config: endpoint_url: str = "https://{region}.digitaloceanspaces.com" cdn_url: str = "https://{bucket}.{region}.cdn.digitaloceanspaces.com/{key}" private: bool = False + key_path: str = "default" + no_folder: bool = False # when true folders are not used for url path class S3Storage(Storage): @@ -21,25 +26,45 @@ class S3Storage(Storage): def __init__(self, config: S3Config): self.bucket = config.bucket self.region = config.region - self.folder = config.folder self.private = config.private + self.cdn_url = config.cdn_url + self.key_path = config.key_path - if len(self.folder) and self.folder[-1] != '/': - self.folder += '/' + if config.no_folder: + self.folder = "" + else: + self.folder = config.folder + if len(self.folder) and self.folder[-1] != '/': + self.folder += '/' + + if self.key_path == "random": + self.key_dict = {} # key => randomKey self.s3 = boto3.client( 's3', - region_name=self.region, - endpoint_url=f'https://{self.region}.digitaloceanspaces.com', + region_name=config.region, + endpoint_url=config.endpoint_url.format(region=config.region), aws_access_key_id=config.key, aws_secret_access_key=config.secret ) def _get_path(self, key): - return self.folder + key + """ + Depends on the self.key_path configuration: + * random - assigns a random UUID which can be used in conjunction with "private=false" to have unguessable documents publicly available -> self.folder/randomUUID + * default -> defaults to self.folder/key + """ + # defaults to /key + final_key = key + if self.key_path == "random": + if key not in self.key_dict: + ext = os.path.splitext(key)[1] + self.key_dict[key] = f"{str(uuid.uuid4())}{ext}" + final_key = self.key_dict[key] + return self.folder + final_key def get_cdn_url(self, key): - return f'https://{self.bucket}.{self.region}.cdn.digitaloceanspaces.com/{self._get_path(key)}' + return 
self.cdn_url.format(bucket=self.bucket, region=self.region, key=self._get_path(key)) def exists(self, key): try: From 6bd6f88b46220ac3bdd82e1936699ca0aa372be8 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Mon, 9 May 2022 17:45:54 +0200 Subject: [PATCH 09/84] refactor --- archivers/base_archiver.py | 7 +++---- archivers/telegram_archiver.py | 3 ++- archivers/telethon_archiver.py | 8 ++++---- archivers/tiktok_archiver.py | 4 ++-- archivers/youtubedl_archiver.py | 4 ++-- configs/config.py | 6 +++--- 6 files changed, 16 insertions(+), 16 deletions(-) diff --git a/archivers/base_archiver.py b/archivers/base_archiver.py index aa85444..a395c3f 100644 --- a/archivers/base_archiver.py +++ b/archivers/base_archiver.py @@ -30,7 +30,6 @@ class ArchiveResult: class Archiver(ABC): name = "default" - TMP_FOLDER = "tmp/" def __init__(self, storage: Storage, driver): self.storage = storage @@ -61,7 +60,7 @@ class Archiver(ABC): page += f"" page_key = self.get_key(urlparse(url).path.replace("/", "_") + ".html") - page_filename = Archiver.TMP_FOLDER + page_key + page_filename = Storage.TMP_FOLDER + page_key page_cdn = self.storage.get_cdn_url(page_key) with open(page_filename, "w") as f: @@ -86,7 +85,7 @@ class Archiver(ABC): if '.' 
not in path: key += '.jpg' - filename = Archiver.TMP_FOLDER + key + filename = Storage.TMP_FOLDER + key d = requests.get(media_url, headers=headers) with open(filename, 'wb') as f: @@ -127,7 +126,7 @@ class Archiver(ABC): def get_screenshot(self, url): key = self.get_key(urlparse(url).path.replace( "/", "_") + datetime.datetime.utcnow().isoformat().replace(" ", "_") + ".png") - filename = Archiver.TMP_FOLDER + key + filename = Storage.TMP_FOLDER + key try: self.driver.get(url) diff --git a/archivers/telegram_archiver.py b/archivers/telegram_archiver.py index 5c586e6..aafba85 100644 --- a/archivers/telegram_archiver.py +++ b/archivers/telegram_archiver.py @@ -6,6 +6,7 @@ import re import html from .base_archiver import Archiver, ArchiveResult +from storages import Storage class TelegramArchiver(Archiver): @@ -52,7 +53,7 @@ class TelegramArchiver(Archiver): video_id = video_url.split('/')[-1].split('?')[0] key = self.get_key(video_id) - filename = Archiver.TMP_FOLDER + key + filename = Storage.TMP_FOLDER + key cdn_url = self.storage.get_cdn_url(key) if check_if_exists and self.storage.exists(key): diff --git a/archivers/telethon_archiver.py b/archivers/telethon_archiver.py index 69bd49a..eb740a9 100644 --- a/archivers/telethon_archiver.py +++ b/archivers/telethon_archiver.py @@ -70,8 +70,8 @@ class TelethonArchiver(Archiver): message = post.message for mp in media_posts: if len(mp.message) > len(message): message = mp.message - filename = self.client.download_media(mp.media, f'{Archiver.TMP_FOLDER}{chat}_{group_id}/{mp.id}') - key = filename.split(Archiver.TMP_FOLDER)[1] + filename = self.client.download_media(mp.media, f'{Storage.TMP_FOLDER}{chat}_{group_id}/{mp.id}') + key = filename.split(Storage.TMP_FOLDER)[1] self.storage.upload(filename, key) hash = self.get_hash(filename) cdn_url = self.storage.get_cdn_url(key) @@ -83,8 +83,8 @@ class TelethonArchiver(Archiver): return ArchiveResult(status=status, cdn_url=page_cdn, title=post.message, timestamp=post.date, 
hash=page_hash, screenshot=screenshot) elif len(media_posts) == 1: key = self.get_key(f'{chat}_{post_id}') - filename = self.client.download_media(post.media, f'{Archiver.TMP_FOLDER}{key}') - key = filename.split(Archiver.TMP_FOLDER)[1].replace(" ", "") + filename = self.client.download_media(post.media, f'{Storage.TMP_FOLDER}{key}') + key = filename.split(Storage.TMP_FOLDER)[1].replace(" ", "") self.storage.upload(filename, key) hash = self.get_hash(filename) cdn_url = self.storage.get_cdn_url(key) diff --git a/archivers/tiktok_archiver.py b/archivers/tiktok_archiver.py index 902e808..4daa675 100644 --- a/archivers/tiktok_archiver.py +++ b/archivers/tiktok_archiver.py @@ -3,7 +3,7 @@ import tiktok_downloader from loguru import logger from .base_archiver import Archiver, ArchiveResult - +from storages import Storage class TiktokArchiver(Archiver): name = "tiktok" @@ -18,7 +18,7 @@ class TiktokArchiver(Archiver): info = tiktok_downloader.info_post(url) key = self.get_key(f'{info.id}.mp4') cdn_url = self.storage.get_cdn_url(key) - filename = Archiver.TMP_FOLDER + key + filename = Storage.TMP_FOLDER + key if check_if_exists and self.storage.exists(key): status = 'already archived' diff --git a/archivers/youtubedl_archiver.py b/archivers/youtubedl_archiver.py index 493b792..bc0456e 100644 --- a/archivers/youtubedl_archiver.py +++ b/archivers/youtubedl_archiver.py @@ -5,11 +5,11 @@ import yt_dlp from loguru import logger from .base_archiver import Archiver, ArchiveResult - +from storages import Storage class YoutubeDLArchiver(Archiver): name = "youtube_dl" - ydl_opts = {'outtmpl': f'{Archiver.TMP_FOLDER}%(id)s.%(ext)s', 'quiet': False} + ydl_opts = {'outtmpl': f'{Storage.TMP_FOLDER}%(id)s.%(ext)s', 'quiet': False} def download(self, url, check_if_exists=False): netloc = self.get_netloc(url) diff --git a/configs/config.py b/configs/config.py index 6e68310..d5ef5ad 100644 --- a/configs/config.py +++ b/configs/config.py @@ -9,7 +9,7 @@ from utils.gworksheet import 
GWorksheet from storages import S3Config, S3Storage from .wayback_config import WaybackConfig from .telegram_config import TelegramConfig -from archivers import Archiver +from storages import Storage class Config: @@ -45,8 +45,8 @@ class Config: assert self.sheet is not None, "'sheet' must be provided either through command line or configuration file" self.header = int(getattr(self.args, "header") or execution.get("header", 1)) - self.tmp_folder = execution.get("tmp_folder", Archiver.TMP_FOLDER) - Archiver.TMP_FOLDER = self.tmp_folder + self.tmp_folder = execution.get("tmp_folder", Storage.TMP_FOLDER) + Storage.TMP_FOLDER = self.tmp_folder self.storage = execution.get("storage", "s3") From e0276dfab11467e4ec9e188d88306987ef1729e2 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Mon, 9 May 2022 18:19:38 +0200 Subject: [PATCH 10/84] additional cleanup --- auto_archive.py | 20 ++++---------------- utils/misc.py | 18 ++++++++++++++++-- 2 files changed, 20 insertions(+), 18 deletions(-) diff --git a/auto_archive.py b/auto_archive.py index af081ad..f7cd77b 100644 --- a/auto_archive.py +++ b/auto_archive.py @@ -1,18 +1,16 @@ # import os import datetime # import argparse -import requests import shutil # import gspread from loguru import logger from dotenv import load_dotenv -# from selenium import webdriver + import traceback # import archivers from archivers import TelethonArchiver, TelegramArchiver, TiktokArchiver, YoutubeDLArchiver, TwitterArchiver, WaybackArchiver, ArchiveResult -from storages import S3Storage -from utils import GWorksheet, mkdir_if_not_exists +from utils import GWorksheet, mkdir_if_not_exists, expand_url from configs import Config load_dotenv() @@ -52,17 +50,6 @@ def update_sheet(gw, row, result: ArchiveResult): gw.batch_set_cell(cell_updates) -def expand_url(url): - # expand short URL links - if 'https://t.co/' in url: - try: - r = requests.get(url) - url = r.url - except: - logger.error(f'Failed to expand 
url {url}') - return url - - def process_sheet(c: Config, sheet, header=1, columns=GWorksheet.COLUMN_NAMES): sh = c.gsheets_client.open(sheet) @@ -87,7 +74,8 @@ def process_sheet(c: Config, sheet, header=1, columns=GWorksheet.COLUMN_NAMES): # order matters, first to succeed excludes remaining active_archivers = [ - TelethonArchiver(storage, c.webdriver, c.telegram_config), + # TODO: uncomment once credentials are ready + # TelethonArchiver(storage, c.webdriver, c.telegram_config), TelegramArchiver(storage, c.webdriver), TiktokArchiver(storage, c.webdriver), YoutubeDLArchiver(storage, c.webdriver), diff --git a/utils/misc.py b/utils/misc.py index e8ef66d..5b1a688 100644 --- a/utils/misc.py +++ b/utils/misc.py @@ -1,5 +1,19 @@ -import os + +import os, requests +from loguru import logger + def mkdir_if_not_exists(folder): if not os.path.exists(folder): - os.mkdir(folder) \ No newline at end of file + os.mkdir(folder) + + +def expand_url(url): + # expand short URL links + if 'https://t.co/' in url: + try: + r = requests.get(url) + url = r.url + except: + logger.error(f'Failed to expand url {url}') + return url From b459f36dda899cc538b8b3e0c09f9495970eaf72 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Mon, 9 May 2022 18:23:01 +0200 Subject: [PATCH 11/84] C --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 5854f68..fdefaf0 100644 --- a/README.md +++ b/README.md @@ -92,4 +92,5 @@ graph TD ```mermaid graph TD A(BaseStorage) -->|parent of| B(S3Storage) + C(BaseStorage) -->|parent of| C(LocalStorage) ``` From 39f27ec1bcb30048c3fcd125c329e3174e812835 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Tue, 10 May 2022 20:23:13 +0200 Subject: [PATCH 12/84] reenable telethon --- auto_archive.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/auto_archive.py b/auto_archive.py index f7cd77b..3af64c2 100644 --- a/auto_archive.py +++ 
b/auto_archive.py @@ -74,8 +74,7 @@ def process_sheet(c: Config, sheet, header=1, columns=GWorksheet.COLUMN_NAMES): # order matters, first to succeed excludes remaining active_archivers = [ - # TODO: uncomment once credentials are ready - # TelethonArchiver(storage, c.webdriver, c.telegram_config), + TelethonArchiver(storage, c.webdriver, c.telegram_config), TelegramArchiver(storage, c.webdriver), TiktokArchiver(storage, c.webdriver), YoutubeDLArchiver(storage, c.webdriver), From 94b37b02bad80f675802a00593d4d4b61c6f5acd Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Tue, 10 May 2022 20:23:44 +0200 Subject: [PATCH 13/84] telethon refactor for failures --- archivers/telethon_archiver.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/archivers/telethon_archiver.py b/archivers/telethon_archiver.py index eb740a9..ee8c109 100644 --- a/archivers/telethon_archiver.py +++ b/archivers/telethon_archiver.py @@ -42,7 +42,6 @@ class TelethonArchiver(Archiver): return False status = "success" - screenshot = self.get_screenshot(url) with self.client.start(): matches = list(matches[0]) @@ -56,6 +55,8 @@ class TelethonArchiver(Archiver): return False media_posts = self._get_media_posts_in_group(chat, post) + + screenshot = self.get_screenshot(url) if len(media_posts) > 1: key = self.get_html_key(url) @@ -80,7 +81,7 @@ class TelethonArchiver(Archiver): page_cdn, page_hash, _ = self.generate_media_page_html(url, uploaded_media, html.escape(str(post))) - return ArchiveResult(status=status, cdn_url=page_cdn, title=post.message, timestamp=post.date, hash=page_hash, screenshot=screenshot) + return ArchiveResult(status=status, cdn_url=page_cdn, title=message, timestamp=post.date, hash=page_hash, screenshot=screenshot) elif len(media_posts) == 1: key = self.get_key(f'{chat}_{post_id}') filename = self.client.download_media(post.media, f'{Storage.TMP_FOLDER}{key}') From f6bc45361a30e11f197d99dcc20d6d41f198fa86 Mon Sep 17 
00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Tue, 10 May 2022 20:48:40 +0200 Subject: [PATCH 14/84] ignore custom configs --- .gitignore | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index c61326d..4eeb410 100644 --- a/.gitignore +++ b/.gitignore @@ -9,4 +9,5 @@ anu.html *.log .pytest_cach anon* -config.json \ No newline at end of file +config.json +config-*.json \ No newline at end of file From d469967c0329bc508aaffaca2c1a6a7d5a694a44 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Tue, 10 May 2022 22:24:21 +0200 Subject: [PATCH 15/84] fix index out of range for empty sheets --- utils/gworksheet.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/utils/gworksheet.py b/utils/gworksheet.py index 6dec9b2..ec7fa3d 100644 --- a/utils/gworksheet.py +++ b/utils/gworksheet.py @@ -24,9 +24,12 @@ class GWorksheet: def __init__(self, worksheet, columns=COLUMN_NAMES, header_row=1): self.wks = worksheet - self.values = self.wks.get_values() - self.headers = [v.lower() for v in self.values[header_row - 1]] self.columns = columns + self.values = self.wks.get_values() + if len(self.values) > 0: + self.headers = [v.lower() for v in self.values[header_row - 1]] + else: + self.headers = [] def _check_col_exists(self, col: str): if col not in self.columns: From bca960b2280b56a7f0aca314d401c80419bf45c0 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Tue, 10 May 2022 23:09:33 +0200 Subject: [PATCH 16/84] merge from master and fixes --- archivers/youtubedl_archiver.py | 2 +- auto_archive.py | 52 ++++++++++++--------------------- configs/config.py | 36 ++++++++++++++++------- test.py | 51 ++++++++++++++++++++++++++++++++ 4 files changed, 95 insertions(+), 46 deletions(-) create mode 100644 test.py diff --git a/archivers/youtubedl_archiver.py b/archivers/youtubedl_archiver.py index 
426641a..c83c7e3 100644 --- a/archivers/youtubedl_archiver.py +++ b/archivers/youtubedl_archiver.py @@ -17,7 +17,7 @@ class YoutubeDLArchiver(Archiver): def download(self, url, check_if_exists=False): netloc = self.get_netloc(url) - if netloc in ['facebook.com', 'www.facebook.com']: + if netloc in ['facebook.com', 'www.facebook.com'] and self.fb_cookie: logger.debug('Using Facebook cookie') yt_dlp.utils.std_headers['cookie'] = self.fb_cookie diff --git a/auto_archive.py b/auto_archive.py index 287e231..749e912 100644 --- a/auto_archive.py +++ b/auto_archive.py @@ -1,18 +1,14 @@ -# import os +import sys import datetime -# import argparse import shutil -# import gspread from loguru import logger from dotenv import load_dotenv import traceback -# import archivers from archivers import TelethonArchiver, TelegramArchiver, TiktokArchiver, YoutubeDLArchiver, TwitterArchiver, WaybackArchiver, ArchiveResult from utils import GWorksheet, mkdir_if_not_exists, expand_url from configs import Config -import sys logger.add("logs/1trace.log", level="TRACE") logger.add("logs/2info.log", level="INFO") @@ -79,16 +75,6 @@ def process_sheet(c: Config, sheet, header=1, columns=GWorksheet.COLUMN_NAMES): c.set_folder(f'{sheet.replace(" ", "_")}/{wks.title.replace(" ", "_")}/') storage = c.get_storage() - # order matters, first to succeed excludes remaining - active_archivers = [ - TelethonArchiver(storage, c.webdriver, c.telegram_config), - TelegramArchiver(storage, c.webdriver), - TiktokArchiver(storage, c.webdriver), - YoutubeDLArchiver(storage, c.webdriver), - TwitterArchiver(storage, c.webdriver), - WaybackArchiver(storage, c.webdriver) - archivers.YoutubeDLArchiver(s3_client, driver, os.getenv('FACEBOOK_COOKIE')), - ] # loop through rows in worksheet for row in range(1 + header, gw.count_rows() + 1): @@ -99,17 +85,19 @@ def process_sheet(c: Config, sheet, header=1, columns=GWorksheet.COLUMN_NAMES): gw.set_cell(row, 'status', 'Archive in progress') url = expand_url(url) - - # make a 
new driver so each spreadsheet row is idempotent - options = webdriver.FirefoxOptions() - options.headless = True - options.set_preference('network.protocol-handler.external.tg', False) + c.recreate_webdriver() + + # order matters, first to succeed excludes remaining + active_archivers = [ + TelethonArchiver(storage, c.webdriver, c.telegram_config), + TelegramArchiver(storage, c.webdriver), + TiktokArchiver(storage, c.webdriver), + YoutubeDLArchiver(storage, c.webdriver, c.facebook_cookie), + TwitterArchiver(storage, c.webdriver), + WaybackArchiver(storage, c.webdriver) + ] - driver = webdriver.Firefox(options=options) - driver.set_window_size(1400, 2000) - # in seconds, telegram screenshots catch which don't come back - driver.set_page_load_timeout(120) for archiver in active_archivers: logger.debug(f'Trying {archiver} on row {row}') @@ -121,23 +109,19 @@ def process_sheet(c: Config, sheet, header=1, columns=GWorksheet.COLUMN_NAMES): if result: if result.status in ['success', 'already archived']: - result.status = archiver.name + \ - ": " + str(result.status) - logger.success( - f'{archiver} succeeded on row {row}') + result.status = f"{archiver.name}: {result.status}" + logger.success(f'{archiver} succeeded on row {row}') break - logger.warning( - f'{archiver} did not succeed on row {row}, final status: {result.status}') - result.status = archiver.name + \ - ": " + str(result.status) - # get rid of driver so can reload on next row - driver.quit() + logger.warning(f'{archiver} did not succeed on row {row}, final status: {result.status}') + result.status = f"{archiver.name}: {result.status}" + if result: update_sheet(gw, row, result) else: gw.set_cell(row, 'status', 'failed: no archiver') logger.success(f'Finshed worksheet {wks.title}') + @logger.catch def main(): logger.debug(f'Passed args:{sys.argv}') diff --git a/configs/config.py b/configs/config.py index d5ef5ad..3ba5bcb 100644 --- a/configs/config.py +++ b/configs/config.py @@ -61,15 +61,20 @@ class Config: # 
selenium driver selenium_configs = execution.get("selenium", {}) self.selenium_timeout = int(selenium_configs.get("timeout_seconds", 10)) - options = webdriver.FirefoxOptions() - options.headless = True - options.set_preference('network.protocol-handler.external.tg', False) - self.webdriver = webdriver.Firefox(options=options) - self.webdriver.set_window_size(1400, 2000) - self.webdriver.set_page_load_timeout(self.selenium_timeout) + self.webdriver = "not initalized" - secrets = self.config.get("secrets", {}) # APIs and service configurations + secrets = self.config.get("secrets", {}) + + # google sheets config + self.gsheets_client = gspread.service_account( + filename=secrets.get("google_api", {}).get("filename", 'service_account.json') + ) + + # facebook config + self.facebook_cookie = secrets.get("facebook", {}).get("cookie", None) + + # s3 config if "s3" in secrets: s3 = secrets["s3"] self.s3_config = S3Config( @@ -86,6 +91,7 @@ class Config: else: logger.debug(f"'s3' key not present in the {self.config_file=}") + # wayback machine config if "wayback" in secrets: self.wayback_config = WaybackConfig( key=secrets["wayback"]["key"], @@ -94,6 +100,7 @@ class Config: else: logger.debug(f"'wayback' key not present in the {self.config_file=}") + # telethon config if "telegram" in secrets: self.telegram_config = TelegramConfig( api_id=secrets["telegram"]["api_id"], @@ -102,10 +109,6 @@ class Config: else: logger.debug(f"'telegram' key not present in the {self.config_file=}") - self.gsheets_client = gspread.service_account( - filename=secrets.get("google_api", {}).get("filename", 'service_account.json') - ) - del self.config["secrets"] def get_argument_parser(self): @@ -133,6 +136,17 @@ class Config: return LocalStorage(self.folder) raise f"storage {self.storage} not yet implemented" + def destroy_webdriver(self): + if self.webdriver is not None: + self.webdriver.quit() + + def recreate_webdriver(self): + options = webdriver.FirefoxOptions() + options.headless = True + 
options.set_preference('network.protocol-handler.external.tg', False) + self.webdriver = webdriver.Firefox(options=options) + self.webdriver.set_window_size(1400, 2000) + self.webdriver.set_page_load_timeout(self.selenium_timeout) def __str__(self) -> str: return json.dumps({ diff --git a/test.py b/test.py new file mode 100644 index 0000000..4061c9f --- /dev/null +++ b/test.py @@ -0,0 +1,51 @@ +import os +import datetime +import argparse +import requests +import shutil +import gspread +from loguru import logger +from dotenv import load_dotenv +from selenium import webdriver +import traceback + +import archivers +from storages import S3Storage, S3Config +from utils import GWorksheet, mkdir_if_not_exists + +load_dotenv() + + +options = webdriver.FirefoxOptions() +options.headless = True +driver = webdriver.Firefox(options=options) +driver.set_window_size(1400, 2000) + +s3_config = S3Config( + bucket=os.getenv('DO_BUCKET'), + region=os.getenv('DO_SPACES_REGION'), + key=os.getenv('DO_SPACES_KEY'), + secret=os.getenv('DO_SPACES_SECRET'), + folder="temp" +) +s3_client = S3Storage(s3_config) +telegram_config = archivers.TelegramConfig( + api_id=os.getenv('TELEGRAM_API_ID'), + api_hash=os.getenv('TELEGRAM_API_HASH') +) + +archiver = archivers.TelethonArchiver(s3_client, driver, telegram_config) + +URLs = [ + # "https://t.me/c/1226032830/24864", + # "https://t.me/truexanewsua/32650", + "https://t.me/informatsia_obstanovka/5239", + # "https://t.me/informatsia_obstanovka/5240", + # "https://t.me/informatsia_obstanovka/5241", + # "https://t.me/informatsia_obstanovka/5242" +] + + +for url in URLs: + print(url) + print(archiver.download(url, False)) From d7f44b948fdb3677c18e906d8232ea693db935ea Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Tue, 10 May 2022 23:15:58 +0200 Subject: [PATCH 17/84] wayback fix --- archivers/wayback_archiver.py | 9 +++------ auto_archive.py | 2 +- 2 files changed, 4 insertions(+), 7 deletions(-) diff 
--git a/archivers/wayback_archiver.py b/archivers/wayback_archiver.py index b1e6824..10814dc 100644 --- a/archivers/wayback_archiver.py +++ b/archivers/wayback_archiver.py @@ -5,18 +5,15 @@ from bs4 import BeautifulSoup from storages import Storage from .base_archiver import Archiver, ArchiveResult -# @dataclass -# class WaybackConfig: -# key: str -# secret: str from configs import WaybackConfig # TODO: use WaybackConfig class WaybackArchiver(Archiver): name = "wayback" - def __init__(self, storage: Storage, driver): + def __init__(self, storage: Storage, driver, config: WaybackConfig): super(WaybackArchiver, self).__init__(storage, driver) + self.config = config self.seen_urls = {} def download(self, url, check_if_exists=False): @@ -25,7 +22,7 @@ class WaybackArchiver(Archiver): ia_headers = { "Accept": "application/json", - "Authorization": "LOW " + os.getenv('INTERNET_ARCHIVE_S3_KEY') + ":" + os.getenv('INTERNET_ARCHIVE_S3_SECRET') + "Authorization": f"LOW {self.config.key}:{self.config.secret}" } r = requests.post( diff --git a/auto_archive.py b/auto_archive.py index 749e912..833f944 100644 --- a/auto_archive.py +++ b/auto_archive.py @@ -95,7 +95,7 @@ def process_sheet(c: Config, sheet, header=1, columns=GWorksheet.COLUMN_NAMES): TiktokArchiver(storage, c.webdriver), YoutubeDLArchiver(storage, c.webdriver, c.facebook_cookie), TwitterArchiver(storage, c.webdriver), - WaybackArchiver(storage, c.webdriver) + WaybackArchiver(storage, c.webdriver, c.wayback_config) ] for archiver in active_archivers: From 2a01038c0c834ecfcc1cb9b7050193a71f40da03 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Wed, 11 May 2022 00:14:42 +0200 Subject: [PATCH 18/84] memleak --- configs/config.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/configs/config.py b/configs/config.py index 3ba5bcb..17fcedf 100644 --- a/configs/config.py +++ b/configs/config.py @@ -137,10 +137,11 @@ class Config: raise f"storage {self.storage} 
not yet implemented" def destroy_webdriver(self): - if self.webdriver is not None: + if self.webdriver is not None and type(self.webdriver) != str: self.webdriver.quit() def recreate_webdriver(self): + self.destroy_webdriver() options = webdriver.FirefoxOptions() options.headless = True options.set_preference('network.protocol-handler.external.tg', False) From ea261635a2354391b1c016beb3c31b87f5c35631 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Wed, 25 May 2022 10:32:26 +0200 Subject: [PATCH 19/84] cleanup --- .gitignore | 4 ++- archivers/wayback_archiver.py | 5 +--- auto_archive.py | 2 +- test.py | 51 ----------------------------------- 4 files changed, 5 insertions(+), 57 deletions(-) delete mode 100644 test.py diff --git a/.gitignore b/.gitignore index 4eeb410..d15b3e8 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,5 @@ tmp/ +temp/ .env* .DS_Store expmt/ @@ -10,4 +11,5 @@ anu.html .pytest_cach anon* config.json -config-*.json \ No newline at end of file +config-*.json +logs/* \ No newline at end of file diff --git a/archivers/wayback_archiver.py b/archivers/wayback_archiver.py index 10814dc..8a0a21f 100644 --- a/archivers/wayback_archiver.py +++ b/archivers/wayback_archiver.py @@ -1,13 +1,10 @@ -import time, requests, os +import time, requests from bs4 import BeautifulSoup -# from dataclasses import dataclass from storages import Storage from .base_archiver import Archiver, ArchiveResult from configs import WaybackConfig - -# TODO: use WaybackConfig class WaybackArchiver(Archiver): name = "wayback" diff --git a/auto_archive.py b/auto_archive.py index 833f944..141bdc8 100644 --- a/auto_archive.py +++ b/auto_archive.py @@ -133,7 +133,7 @@ def main(): mkdir_if_not_exists(c.tmp_folder) process_sheet(c, c.sheet, header=c.header, columns=c.column_names) shutil.rmtree(c.tmp_folder) - c.webdriver.quit() + c.destroy_webdriver() if __name__ == '__main__': diff --git a/test.py b/test.py deleted file mode 100644 index 
4061c9f..0000000 --- a/test.py +++ /dev/null @@ -1,51 +0,0 @@ -import os -import datetime -import argparse -import requests -import shutil -import gspread -from loguru import logger -from dotenv import load_dotenv -from selenium import webdriver -import traceback - -import archivers -from storages import S3Storage, S3Config -from utils import GWorksheet, mkdir_if_not_exists - -load_dotenv() - - -options = webdriver.FirefoxOptions() -options.headless = True -driver = webdriver.Firefox(options=options) -driver.set_window_size(1400, 2000) - -s3_config = S3Config( - bucket=os.getenv('DO_BUCKET'), - region=os.getenv('DO_SPACES_REGION'), - key=os.getenv('DO_SPACES_KEY'), - secret=os.getenv('DO_SPACES_SECRET'), - folder="temp" -) -s3_client = S3Storage(s3_config) -telegram_config = archivers.TelegramConfig( - api_id=os.getenv('TELEGRAM_API_ID'), - api_hash=os.getenv('TELEGRAM_API_HASH') -) - -archiver = archivers.TelethonArchiver(s3_client, driver, telegram_config) - -URLs = [ - # "https://t.me/c/1226032830/24864", - # "https://t.me/truexanewsua/32650", - "https://t.me/informatsia_obstanovka/5239", - # "https://t.me/informatsia_obstanovka/5240", - # "https://t.me/informatsia_obstanovka/5241", - # "https://t.me/informatsia_obstanovka/5242" -] - - -for url in URLs: - print(url) - print(archiver.download(url, False)) From d33daabee1b986dfecd97315eb955f1e898ea3d9 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Fri, 3 Jun 2022 15:46:00 +0200 Subject: [PATCH 20/84] refactoring storages --- storages/base_storage.py | 7 +- storages/gd_storage.py | 200 ++++++++++++++------------------------ storages/local_storage.py | 6 +- storages/s3_storage.py | 9 +- 4 files changed, 85 insertions(+), 137 deletions(-) diff --git a/storages/base_storage.py b/storages/base_storage.py index bfb6911..3d9e361 100644 --- a/storages/base_storage.py +++ b/storages/base_storage.py @@ -5,6 +5,7 @@ from pathlib import Path class Storage(ABC): TMP_FOLDER = "tmp/" 
+ @abstractmethod def __init__(self, config): pass @@ -28,18 +29,18 @@ class Storage(ABC): and others not, but that all can call """ for k, v in kwargs.items(): - if k in self.get_allowed_properties(): + if k in self._get_allowed_properties(): setattr(self, k, v) else: logger.warning(f'[{self.__class__.__name__}] does not accept dynamic property "{k}"') - def get_allowed_properties(self): + def _get_allowed_properties(self): """ child classes should specify which properties they allow to be set """ return set(["subfolder"]) - def clean_path(self, folder, default="", add_forward_slash=True): + def _clean_path(self, folder, default="", add_forward_slash=True): if folder is None or type(folder) != str or len(folder.strip()) == 0: return default return str(Path(folder)) + ("/" if add_forward_slash else "") diff --git a/storages/gd_storage.py b/storages/gd_storage.py index 3d65519..f5f4066 100644 --- a/storages/gd_storage.py +++ b/storages/gd_storage.py @@ -12,15 +12,14 @@ import time @dataclass class GDConfig: root_folder_id: str + default_upload_folder_name: str = "default" class GDStorage(Storage): - DEFAULT_UPLOAD_FOLDER_NAME = "default" - def __init__(self, config: GDConfig): + self.default_upload_folder_name = config.default_upload_folder_name self.root_folder_id = config.root_folder_id - SCOPES = ['https://www.googleapis.com/auth/drive'] - creds = service_account.Credentials.from_service_account_file('service_account.json', scopes=SCOPES) + creds = service_account.Credentials.from_service_account_file('service_account.json', scopes=['https://www.googleapis.com/auth/drive']) self.service = build('drive', 'v3', credentials=creds) def get_cdn_url(self, key): @@ -28,150 +27,54 @@ class GDStorage(Storage): only support files saved in a folder for GD S3 supports folder and all stored in the root """ - self.subfolder = self.clean_path(self.subfolder, GDStorage.DEFAULT_UPLOAD_FOLDER_NAME, False) + self.subfolder = self._clean_path(self.subfolder, 
self.default_upload_folder_name, False) filename = key logger.debug(f'Looking for {self.subfolder} and filename: {filename} on GD') - # retry policy on Google Drive - try_again = True - counter = 1 - folder_id = None - while try_again: - # need to lookup the id of folder eg SM0002 which should be there already as this is get_cdn_url - results = self.service.files().list( - q=f"'{self.root_folder_id}' in parents and name = '{self.subfolder}' ", - spaces='drive', # ie not appDataFolder or photos - fields='files(id, name)' - ).execute() - items = results.get('files', []) + folder_id = self._get_id_from_parent_and_name(self.root_folder_id, self.subfolder, 5, 10) - for item in items: - logger.debug(f"found folder of {item['name']}") - folder_id = item['id'] - try_again = False - - if folder_id is None: - logger.debug(f'Cannot find {self.subfolder=} waiting and trying again {counter=}') - counter += 1 - time.sleep(10) - if counter > 18: - raise ValueError(f'Cannot find {self.subfolder} and retried 18 times pausing 10s at a time which is 3 minutes') - - # check for sub folder in file eg youtube_dl_sDE-qZdi8p8/index.html' - # happens doing thumbnails + # check for sub folder in file youtube_dl_abcde/index.html, needed for thumbnails + # a='youtube_dl_abcde', b='index.html' a, _, b = filename.partition('/') - if b != '': - # a: 'youtube_dl_sDE-qZdi8p8' - # b: 'index.html' - logger.debug(f'get_cdn_url: Found a subfolder so need to split on a: {a} and {b}') - - # get id of the sub folder - results = self.service.files().list( - q=f"'{folder_id}' in parents and mimeType='application/vnd.google-apps.folder' and name = '{a}' ", - spaces='drive', # ie not appDataFolder or photos - fields='files(id, name)' - ).execute() - items = results.get('files', []) - - filename = None - for item in items: - folder_id = item['id'] - filename = b - if filename is None: - raise ValueError(f'Problem finding sub folder {a}') + logger.debug(f'get_cdn_url: Found a subfolder so need to split on: 
{a=} and {b=}') + folder_id = self._get_id_from_parent_and_name(folder_id, a, use_mime_type=True) + filename = b # get id of file inside folder (or sub folder) - results = self.service.files().list( - q=f"'{folder_id}' in parents and name = '{filename}' ", - spaces='drive', - fields='files(id, name)' - ).execute() - items = results.get('files', []) - - file_id = None - for item in items: - logger.debug(f"found file of {item['name']}") - file_id = item['id'] - - if file_id is None: - raise ValueError(f'Problem finding file {filename} in folder_id {folder_id}') - - foo = "https://drive.google.com/file/d/" + file_id + "/view?usp=sharing" - return foo + file_id = self._get_id_from_parent_and_name(folder_id, filename) + return f"https://drive.google.com/file/d/{file_id}/view?usp=sharing" def exists(self, _key): - # TODO: How to check for google drive, as it accepts different names + # TODO: How to check for google drive, as it accepts different names? return False def uploadf(self, file, key, **_kwargs): - logger.debug(f"before {self.subfolder=}") - self.subfolder = self.clean_path(self.subfolder, GDStorage.DEFAULT_UPLOAD_FOLDER_NAME, False) + """ + 1. check if subfolder exists or create it + 2. check if key contains sub-subfolder, check if exists or create it + 3. 
upload file to root_id/subfolder[/sub-subfolder]/filename + """ + self.subfolder = self._clean_path(self.subfolder, GDStorage.DEFAULT_UPLOAD_FOLDER_NAME, False) filename = key - logger.debug(f"after {self.subfolder=}") - # does folder eg SM0005 exist already inside parent of Files auto-archiver - results = self.service.files().list( - q=f"'{self.root_folder_id}' in parents and mimeType='application/vnd.google-apps.folder' and name = '{self.subfolder}' ", - spaces='drive', - fields='files(id, name)' - ).execute() - items = results.get('files', []) - folder_id_to_upload_to = None - if len(items) > 1: - logger.error(f'Duplicate folder name of {self.subfolder} which should never happen, but continuing anyway') - - for item in items: - logger.debug(f"Found existing folder of {item['name']}") - folder_id_to_upload_to = item['id'] + # get id of subfolder or create if it does not exist + folder_id_to_upload_to = self._get_id_from_parent_and_name(self.root_folder_id, self.subfolder, use_mime_type=True, raise_on_missing=False) if folder_id_to_upload_to is None: - logger.debug(f'Creating new folder {self.subfolder}') - file_metadata = { - 'name': [self.subfolder], - 'mimeType': 'application/vnd.google-apps.folder', - 'parents': [self.root_folder_id] - } - gd_file = self.service.files().create(body=file_metadata, fields='id').execute() - folder_id_to_upload_to = gd_file.get('id') + folder_id_to_upload_to = self._mkdir(self.subfolder, self.root_folder_id) - # check for subfolder name in file eg youtube_dl_sDE-qZdi8p8/out1.jpg', eg: thumbnails - # will always return a and a blank b even if there is nothing to split - # https://stackoverflow.com/a/38149500/26086 + # check for sub folder in file youtube_dl_abcde/index.html, needed for thumbnails + # a='youtube_dl_abcde', b='index.html' a, _, b = filename.partition('/') - if b != '': - # a: 'youtube_dl_sDE-qZdi8p8' - # b: 'out1.jpg' - logger.debug(f'uploadf: Found a subfolder so need to split on a: {a} and {b}') - - # does the 'a' 
folder exist already in folder_id_to_upload_to eg SM0005 - results = self.service.files().list( - q=f"'{folder_id_to_upload_to}' in parents and mimeType='application/vnd.google-apps.folder' and name = '{a}' ", - spaces='drive', # ie not appDataFolder or photos - fields='files(id, name)' - ).execute() - items = results.get('files', []) - sub_folder_id_to_upload_to = None - if len(items) > 1: - logger.error(f'Duplicate folder name of {a} which should never happen') - - for item in items: - logger.debug(f"Found existing folder of {item['name']}") - sub_folder_id_to_upload_to = item['id'] - + logger.debug(f'uploadf: Found a subfolder so need to split on: {a=} and {b=}') + # get id of subfolder or create if it does not exist + sub_folder_id_to_upload_to = self._get_id_from_parent_and_name(folder_id_to_upload_to, a, use_mime_type=True, raise_on_missing=False) if sub_folder_id_to_upload_to is None: - # create new folder - file_metadata = { - 'name': [a], - 'mimeType': 'application/vnd.google-apps.folder', - 'parents': [folder_id_to_upload_to] - } - gd_file = self.service.files().create(body=file_metadata, fields='id').execute() - sub_folder_id_to_upload_to = gd_file.get('id') + sub_folder_id_to_upload_to = self._mkdir(a, folder_id_to_upload_to) filename = b folder_id_to_upload_to = sub_folder_id_to_upload_to - # back to normal control flow # upload file to gd file_metadata = { @@ -180,8 +83,55 @@ class GDStorage(Storage): } media = MediaFileUpload(file, resumable=True) gd_file = self.service.files().create(body=file_metadata, media_body=media, fields='id').execute() + logger.debug(f'uploadf: uploaded file {gd_file["id"]} succesfully in folder={folder_id_to_upload_to}') def upload(self, filename: str, key: str, **kwargs): # GD only requires the filename not a file reader logger.debug(f'[{self.__class__.__name__}] uploading file {filename} with key {key}') self.uploadf(filename, key, **kwargs) + + def _get_id_from_parent_and_name(self, parent_id: str, name: str, retries: 
int = 1, sleep_seconds: int = 10, use_mime_type: bool = False, raise_on_missing: bool = True): + """ + Retrieves the id of a folder or file from its @name and the @parent_id folder + Optionally does multiple @retries and sleeps @sleep_seconds between them + If @use_mime_type will restrict search to "mimeType='application/vnd.google-apps.folder'" + If @raise_on_missing will throw error when not found, or returns None + Returns the id of the file or folder from its name as a string + """ + debug_header: str = f"[searching {name=} in {parent_id=}]" + query_string = f"'{parent_id}' in parents and name = '{name}' " + if use_mime_type: + query_string += f" and mimeType='application/vnd.google-apps.folder' " + + for attempt in range(retries): + results = self.service.files().list( + q=query_string, + spaces='drive', # ie not appDataFolder or photos + fields='files(id, name)' + ).execute() + items = results.get('files', []) + + if len(items) > 0: + logger.debug(f"{debug_header} found {len(items)} matches, returning last of {','.join([i['id'] for i in items])}") + return items[-1]['id'] + else: + logger.debug(f'{debug_header} not found, attempt {attempt+1}/{retries}. 
sleeping for {sleep_seconds} second(s)') + if attempt < retries - 1: time.sleep(sleep_seconds) + + if raise_on_missing: + raise ValueError(f'{debug_header} not found after {retries} attempt(s)') + return None + + def _mkdir(self, name: str, parent_id: str): + """ + Creates a new GDrive folder @name inside folder @parent_id + Returns id of the created folder + """ + logger.debug(f'[_mkdir] Creating new folder with {name=} inside {parent_id=}') + file_metadata = { + 'name': [name], + 'mimeType': 'application/vnd.google-apps.folder', + 'parents': [parent_id] + } + gd_folder = self.service.files().create(body=file_metadata, fields='id').execute() + return gd_folder.get('id') diff --git a/storages/local_storage.py b/storages/local_storage.py index 0dcdaef..f93446b 100644 --- a/storages/local_storage.py +++ b/storages/local_storage.py @@ -4,12 +4,10 @@ from .base_storage import Storage class LocalStorage(Storage): def __init__(self, folder): - self.folder = folder - if len(self.folder) and self.folder[-1] != '/': - self.folder += '/' + self.folder = self._clean_path(folder) def get_cdn_url(self, key): - return self.folder + key + return self.folder + self._clean_path(self.subfolder) + key def exists(self, key): return os.path.isfile(self.get_cdn_url(key)) diff --git a/storages/s3_storage.py b/storages/s3_storage.py index c637e25..5e882b3 100644 --- a/storages/s3_storage.py +++ b/storages/s3_storage.py @@ -19,8 +19,8 @@ class S3Config: endpoint_url: str = "https://{region}.digitaloceanspaces.com" cdn_url: str = "https://{bucket}.{region}.cdn.digitaloceanspaces.com/{key}" private: bool = False - key_path: str = "default" - no_folder: bool = False # when true folders are not used for url path + key_path: str = "default" # 'default' uses full naming, 'random' uses generated uuid + no_folder: bool = False # when true folders are not used for url path class S3Storage(Storage): @@ -28,7 +28,7 @@ class S3Storage(Storage): def __init__(self, config: S3Config): self.bucket = 
config.bucket self.region = config.region - self.folder = self.clean_path(config.folder) + self.folder = self._clean_path(config.folder) self.private = config.private self.cdn_url = config.cdn_url self.key_path = config.key_path @@ -54,8 +54,7 @@ class S3Storage(Storage): ext = os.path.splitext(key)[1] self.key_dict[key] = f"{str(uuid.uuid4())}{ext}" final_key = self.key_dict[key] - return self.folder + final_key - return self.folder + self.clean_path(self.subfolder) + key + return self.folder + self._clean_path(self.subfolder) + final_key def get_cdn_url(self, key): return self.cdn_url.format(bucket=self.bucket, region=self.region, key=self._get_path(key)) From c679e02c73ce865e06e0e9642c4c4949ca20e62c Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Fri, 3 Jun 2022 17:32:02 +0200 Subject: [PATCH 21/84] updated storages init --- storages/__init__.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/storages/__init__.py b/storages/__init__.py index 8b39d7d..773c0b3 100644 --- a/storages/__init__.py +++ b/storages/__init__.py @@ -1,4 +1,5 @@ # we need to explicitly expose the available imports here -from .base_storage import * -from .local_storage import * -from .s3_storage import * \ No newline at end of file +from .base_storage import Storage +from .local_storage import LocalStorage +from .s3_storage import S3Config, S3Storage +from .gd_storage import GDConfig, GDStorage \ No newline at end of file From a2fdfacb265d1546c0b0cf3d47d30ccfc48f4707 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Fri, 3 Jun 2022 17:32:25 +0200 Subject: [PATCH 22/84] config refactor and cleanup --- configs/config.py | 102 +++++++++++++++++++++++++++-------------- storages/gd_storage.py | 10 ++-- 2 files changed, 74 insertions(+), 38 deletions(-) diff --git a/configs/config.py b/configs/config.py index 17fcedf..11ed808 100644 --- a/configs/config.py +++ b/configs/config.py @@ 
-3,13 +3,19 @@ import argparse, json import gspread from loguru import logger from selenium import webdriver -from storages.local_storage import LocalStorage +from dataclasses import dataclass from utils.gworksheet import GWorksheet -from storages import S3Config, S3Storage from .wayback_config import WaybackConfig from .telegram_config import TelegramConfig -from storages import Storage +from storages import Storage, S3Config, S3Storage, GDStorage, GDConfig, LocalStorage + + +@dataclass +class SeleniumConfig: + timeout_seconds: int = 120 + window_width: int = 1400 + window_height: int = 2000 class Config: @@ -20,8 +26,13 @@ class Config: c.parse() # parses the values and initializes the Services and API clients # you can then access the Services and APIs like c.s3_config - + All the configurations available as cmd line options, when included, will + override the configurations in the config.json file. + Configurations are split between: + 1. "secrets" containing API keys for generating services - not kept in memory + 2. 
"execution" containing specific execution configurations """ + AVAILABLE_STORAGES = {"s3", "gd", "local"} def __init__(self): self.parser = self.get_argument_parser() @@ -38,37 +49,40 @@ class Config: with open(self.config_file, "r", encoding="utf-8") as inf: self.config = json.load(inf) + # ----------------------EXECUTION - execution configurations execution = self.config.get("execution", {}) - # general sheet configurations - self.sheet = getattr(self.args, "sheet") or execution.get("sheet") + self.sheet = getattr(self.args, "sheet", execution.get("sheet")) assert self.sheet is not None, "'sheet' must be provided either through command line or configuration file" + self.header = int(getattr(self.args, "header", execution.get("header", 1))) + Storage.TMP_FOLDER = execution.get("tmp_folder", Storage.TMP_FOLDER) + self.storage = getattr(self.args, "storage", execution.get("storage", "s3")) - self.header = int(getattr(self.args, "header") or execution.get("header", 1)) - self.tmp_folder = execution.get("tmp_folder", Storage.TMP_FOLDER) - Storage.TMP_FOLDER = self.tmp_folder - - self.storage = execution.get("storage", "s3") + for key, name in [("s3", "s3"), ("gd", "google_drive")]: + assert self.storage != key or name in secrets, f"selected storage '{key}' requires secrets.'{name}' in {self.config_file}" # Column names come from config and can be overwritten by CMD # in the end all are considered as lower case config_column_names = execution.get("column_names", {}) self.column_names = {} for k in GWorksheet.COLUMN_NAMES.keys(): - self.column_names[k] = getattr(self.args, k) or config_column_names.get(k) or GWorksheet.COLUMN_NAMES[k] - self.column_names = {k: v.lower() for k, v in self.column_names.items()} + self.column_names[k] = getattr(self.args, k, config_column_names.get(k, GWorksheet.COLUMN_NAMES[k])).lower() # selenium driver selenium_configs = execution.get("selenium", {}) - self.selenium_timeout = int(selenium_configs.get("timeout_seconds", 10)) + 
self.selenium_config = SeleniumConfig( + timeout_seconds=int(selenium_configs.get("timeout_seconds", SeleniumConfig.timeout_seconds)), + window_width=int(selenium_configs.get("window_width", SeleniumConfig.window_width)), + window_height=int(selenium_configs.get("window_height", SeleniumConfig.window_height)) + ) self.webdriver = "not initalized" - # APIs and service configurations + # ---------------------- SECRETS - APIs and service configurations secrets = self.config.get("secrets", {}) # google sheets config self.gsheets_client = gspread.service_account( - filename=secrets.get("google_api", {}).get("filename", 'service_account.json') + filename=secrets.get("google_sheets", {}).get("service_account", 'service_account.json') ) # facebook config @@ -81,15 +95,22 @@ class Config: bucket=s3["bucket"], region=s3["region"], key=s3["key"], - secret=s3["secret"] + secret=s3["secret"], + endpoint_url=s3.get("endpoint_url", S3Config.endpoint_url), + cdn_url=s3.get("cdn_url", S3Config.cdn_url), + key_path=s3.get("key_path", S3Config.key_path), + private=getattr(self.args, "s3-private", s3.get("private", S3Config.private)), + no_folder=s3.get("no_folder", S3Config.no_folder), + ) + + # GDrive config + if "google_drive" in secrets: + gd = secrets["google_drive"] + self.gd_config = GDConfig( + root_folder_id=gd.get("root_folder_id"), + default_folder=gd.get("default_folder", GDConfig.default_folder), + service_account=gd.get("service_account", GDConfig.service_account), ) - self.s3_config.private = getattr(self.args, "private") or s3["private"] or self.s3_config.private - self.s3_config.endpoint_url = s3["endpoint_url"] or self.s3_config.endpoint_url - self.s3_config.cdn_url = s3["cdn_url"] or self.s3_config.cdn_url - self.s3_config.key_path = s3["key_path"] or self.s3_config.key_path - self.s3_config.no_folder = s3["no_folder"] or self.s3_config.no_folder - else: - logger.debug(f"'s3' key not present in the {self.config_file=}") # wayback machine config if "wayback" in 
secrets: @@ -109,32 +130,44 @@ class Config: else: logger.debug(f"'telegram' key not present in the {self.config_file=}") - del self.config["secrets"] + del self.config["secrets"] # delete to prevent leaks def get_argument_parser(self): - parser = argparse.ArgumentParser(description='Automatically archive social media videos from a Google Sheets document') + """ + Creates the CMD line arguments. 'python auto_archive.py --help' + """ + parser = argparse.ArgumentParser(description='Automatically archive social media posts, videos, and images from a Google Sheets document. The command line arguments will always override the configurations in the provided JSON config file (--config), only some high-level options are allowed via the command line and the JSON configuration file is the preferred method. ') parser.add_argument('--config', action='store', dest='config', help='the filename of the JSON configuration file (defaults to \'config.json\')', default='config.json') + parser.add_argument('--storage', action='store', dest='storage', help='which storage to use [execution.storage in config.json]', choices=Config.AVAILABLE_STORAGES) parser.add_argument('--sheet', action='store', dest='sheet', help='the name of the google sheets document [execution.sheet in config.json]') parser.add_argument('--header', action='store', dest='header', help='1-based index for the header row [execution.header in config.json]') - parser.add_argument('--private', action='store_true', help='Store content without public access permission [execution.header in config.json]') + parser.add_argument('--s3-private', action='store_true', help='Store content without public access permission (only for storage=s3) [secrets.s3.private in config.json]') for k, v in GWorksheet.COLUMN_NAMES.items(): - parser.add_argument(f'--col-{k}', action='store', dest=k, help=f'the name of the column to fill with {k} (default={v})') + parser.add_argument(f'--col-{k}', action='store', dest=k, help=f"name of the column to 
fill with {k} (default='{v}')") return parser def set_folder(self, folder): # update the folder in each of the storages self.folder = folder - self.s3_config.folder = folder + if self.s3_config: + self.s3_config.folder = folder + if self.gd_config: + self.gd_config.default_folder = folder def get_storage(self): + """ + creates and returns the configured type of storage + """ if self.storage == "s3": return S3Storage(self.s3_config) + elif self.storage == "gd": + return GDStorage(self.gd_config) elif self.storage == "local": return LocalStorage(self.folder) - raise f"storage {self.storage} not yet implemented" + raise f"storage {self.storage} not implemented, available: {Config.AVAILABLE_STORAGES}" def destroy_webdriver(self): if self.webdriver is not None and type(self.webdriver) != str: @@ -146,16 +179,17 @@ class Config: options.headless = True options.set_preference('network.protocol-handler.external.tg', False) self.webdriver = webdriver.Firefox(options=options) - self.webdriver.set_window_size(1400, 2000) - self.webdriver.set_page_load_timeout(self.selenium_timeout) + self.webdriver.set_window_size(self.selenium_config.window_width, + self.selenium_config.window_height) + self.webdriver.set_page_load_timeout(self.selenium_config.timeout_seconds) def __str__(self) -> str: return json.dumps({ "config_file": self.config_file, "sheet": self.sheet, "header": self.header, - "tmp_folder": self.tmp_folder, - "selenium_timeout_seconds": self.selenium_timeout, + "tmp_folder": Storage.TMP_FOLDER, + "selenium_config": self.selenium_config, "selenium_webdriver": self.webdriver != None, "s3_config": self.s3_config != None, "s3_private": getattr(self.s3_config, "private", None), diff --git a/storages/gd_storage.py b/storages/gd_storage.py index f5f4066..f4f2820 100644 --- a/storages/gd_storage.py +++ b/storages/gd_storage.py @@ -12,14 +12,16 @@ import time @dataclass class GDConfig: root_folder_id: str - default_upload_folder_name: str = "default" + default_folder: str = 
"default" + service_account: str = "service_account.json" class GDStorage(Storage): def __init__(self, config: GDConfig): - self.default_upload_folder_name = config.default_upload_folder_name + self.default_folder = config.default_folder self.root_folder_id = config.root_folder_id - creds = service_account.Credentials.from_service_account_file('service_account.json', scopes=['https://www.googleapis.com/auth/drive']) + creds = service_account.Credentials.from_service_account_file( + config.service_account, scopes=['https://www.googleapis.com/auth/drive']) self.service = build('drive', 'v3', credentials=creds) def get_cdn_url(self, key): @@ -27,7 +29,7 @@ class GDStorage(Storage): only support files saved in a folder for GD S3 supports folder and all stored in the root """ - self.subfolder = self._clean_path(self.subfolder, self.default_upload_folder_name, False) + self.subfolder = self._clean_path(self.subfolder, self.default_folder, False) filename = key logger.debug(f'Looking for {self.subfolder} and filename: {filename} on GD') From aaa1d299daf3c4c8bdba005fd6e690207ae3850f Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Fri, 3 Jun 2022 17:32:55 +0200 Subject: [PATCH 23/84] started cleaning auto_archive --- auto_archive.py | 29 +---------------------------- 1 file changed, 1 insertion(+), 28 deletions(-) diff --git a/auto_archive.py b/auto_archive.py index c23e73e..5df2cd0 100644 --- a/auto_archive.py +++ b/auto_archive.py @@ -9,8 +9,7 @@ import traceback from archivers import TelethonArchiver, TelegramArchiver, TiktokArchiver, YoutubeDLArchiver, TwitterArchiver, WaybackArchiver, ArchiveResult from utils import GWorksheet, mkdir_if_not_exists, expand_url from configs import Config -import archivers -from storages import S3Storage, S3Config + from storages.gd_storage import GDConfig, GDStorage from utils import GWorksheet, mkdir_if_not_exists import sys @@ -21,9 +20,6 @@ logger.add("logs/3success.log", level="SUCCESS") 
logger.add("logs/4warning.log", level="WARNING") logger.add("logs/5error.log", level="ERROR") -load_dotenv() - - def update_sheet(gw, row, result: ArchiveResult): cell_updates = [] row_values = gw.get_row(row) @@ -61,25 +57,6 @@ def update_sheet(gw, row, result: ArchiveResult): def process_sheet(c: Config, sheet, header=1, columns=GWorksheet.COLUMN_NAMES): sh = c.gsheets_client.open(sheet) - -def process_sheet(sheet, storage="s3", header=1, columns=GWorksheet.COLUMN_NAMES): - gc = gspread.service_account(filename='service_account.json') - sh = gc.open(sheet) - - s3_config = S3Config( - bucket=os.getenv('DO_BUCKET'), - region=os.getenv('DO_SPACES_REGION'), - key=os.getenv('DO_SPACES_KEY'), - secret=os.getenv('DO_SPACES_SECRET') - ) - gd_config = GDConfig( - root_folder_id=os.getenv('GD_ROOT_FOLDER_ID'), - ) - telegram_config = archivers.TelegramConfig( - api_id=os.getenv('TELEGRAM_API_ID'), - api_hash=os.getenv('TELEGRAM_API_HASH') - ) - # loop through worksheets to check for ii, wks in enumerate(sh.worksheets()): logger.info(f'Opening worksheet {ii=}: {wks.title=} {header=}') @@ -99,9 +76,6 @@ def process_sheet(sheet, storage="s3", header=1, columns=GWorksheet.COLUMN_NAMES c.set_folder(f'{sheet.replace(" ", "_")}/{wks.title.replace(" ", "_")}/') storage = c.get_storage() - gd_config.folder = f'{sheet.replace(" ", "_")}/{wks.title.replace(" ", "_")}/' - gd_client = GDStorage(gd_config) - # loop through rows in worksheet for row in range(1 + header, gw.count_rows() + 1): url = gw.get_cell(row, 'url') @@ -188,7 +162,6 @@ def process_sheet(sheet, storage="s3", header=1, columns=GWorksheet.COLUMN_NAMES @logger.catch def main(): - logger.debug(f'Passed args:{sys.argv}') c = Config() c.parse() From 5135e97d3fdde49c6ef768e200aa8a1349f1285b Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Fri, 3 Jun 2022 18:03:49 +0200 Subject: [PATCH 24/84] cleanup auto_archive and config --- Pipfile | 1 - Pipfile.lock | 8 -- 
archivers/wayback_archiver.py | 2 +- auto_archive.py | 175 ++++++++++++---------------------- configs/config.py | 13 ++- utils/misc.py | 3 +- 6 files changed, 75 insertions(+), 127 deletions(-) diff --git a/Pipfile b/Pipfile index 7e5cbd7..7e31a3c 100644 --- a/Pipfile +++ b/Pipfile @@ -6,7 +6,6 @@ name = "pypi" [packages] gspread = "*" boto3 = "*" -python-dotenv = "*" argparse = "*" beautifulsoup4 = "*" tiktok-downloader = {git = "https://github.com/msramalho/tiktok-downloader"} diff --git a/Pipfile.lock b/Pipfile.lock index e47d720..9ee753d 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -707,14 +707,6 @@ "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", "version": "==2.8.2" }, - "python-dotenv": { - "hashes": [ - "sha256:b7e3b04a59693c42c36f9ab1cc2acc46fa5df8c78e178fc33a8d4cd05c8d498f", - "sha256:d92a187be61fe482e4fd675b6d52200e7be63a12b724abbf931a40ce4fa92938" - ], - "index": "pypi", - "version": "==0.20.0" - }, "requests": { "hashes": [ "sha256:68d7c56fd5a8999887728ef304a6d12edc7be74f1cfa47714fc8b414525c9a61", diff --git a/archivers/wayback_archiver.py b/archivers/wayback_archiver.py index bb18d2b..c76437e 100644 --- a/archivers/wayback_archiver.py +++ b/archivers/wayback_archiver.py @@ -78,6 +78,6 @@ class WaybackArchiver(Archiver): title = "Could not get title" screenshot = self.get_screenshot(url) - result = ArchiveResult(status='Internet Archive fallback', cdn_url=archive_url, title=title, screenshot=screenshot) + result = ArchiveResult(status='success', cdn_url=archive_url, title=title, screenshot=screenshot) self.seen_urls[url] = result return result diff --git a/auto_archive.py b/auto_archive.py index 5df2cd0..a0f8883 100644 --- a/auto_archive.py +++ b/auto_archive.py @@ -1,24 +1,12 @@ -import sys import datetime import shutil -from loguru import logger -from dotenv import load_dotenv - import traceback +from loguru import logger from archivers import TelethonArchiver, TelegramArchiver, TiktokArchiver, 
YoutubeDLArchiver, TwitterArchiver, WaybackArchiver, ArchiveResult from utils import GWorksheet, mkdir_if_not_exists, expand_url from configs import Config -from storages.gd_storage import GDConfig, GDStorage -from utils import GWorksheet, mkdir_if_not_exists -import sys - -logger.add("logs/1trace.log", level="TRACE") -logger.add("logs/2info.log", level="INFO") -logger.add("logs/3success.log", level="SUCCESS") -logger.add("logs/4warning.log", level="WARNING") -logger.add("logs/5error.log", level="ERROR") def update_sheet(gw, row, result: ArchiveResult): cell_updates = [] @@ -33,8 +21,7 @@ def update_sheet(gw, row, result: ArchiveResult): batch_if_valid('archive', result.cdn_url) batch_if_valid('date', True, datetime.datetime.utcnow().replace(tzinfo=datetime.timezone.utc).isoformat()) - batch_if_valid('thumbnail', result.thumbnail, - f'=IMAGE("{result.thumbnail}")') + batch_if_valid('thumbnail', result.thumbnail, f'=IMAGE("{result.thumbnail}")') batch_if_valid('thumbnail_index', result.thumbnail_index) batch_if_valid('title', result.title) batch_if_valid('duration', result.duration, str(result.duration)) @@ -54,109 +41,82 @@ def update_sheet(gw, row, result: ArchiveResult): gw.batch_set_cell(cell_updates) -def process_sheet(c: Config, sheet, header=1, columns=GWorksheet.COLUMN_NAMES): - sh = c.gsheets_client.open(sheet) +def missing_required_columns(gw: GWorksheet): + required_found = True + for required_col in ['url', 'status']: + if not gw.col_exists(required_col): + logger.warning(f'Required column for {required_col}: "{gw.columns[required_col]}" not found, skipping worksheet {gw.worksheet.title}') + required_found = False + return required_found + + +def process_sheet(c: Config): + sh = c.gsheets_client.open(c.sheet) # loop through worksheets to check for ii, wks in enumerate(sh.worksheets()): - logger.info(f'Opening worksheet {ii=}: {wks.title=} {header=}') - gw = GWorksheet(wks, header_row=header, columns=columns) + logger.info(f'Opening worksheet {ii=}: 
{wks.title=} {c.header=}') + gw = GWorksheet(wks, header_row=c.header, columns=c.column_names) - if not gw.col_exists('url'): - logger.warning( - f'No "{c.column_names["url"]}" column found, skipping worksheet {wks.title}') - continue - - if not gw.col_exists('status'): - logger.warning( - f'No "{c.column_names["status"]}" column found, skipping worksheet {wks.title}') - continue + if missing_required_columns(gw): continue # archives will be in a folder 'doc_name/worksheet_name' - c.set_folder(f'{sheet.replace(" ", "_")}/{wks.title.replace(" ", "_")}/') + # TODO: use slugify lib + c.set_folder(f'{c.sheet.replace(" ", "_")}/{wks.title.replace(" ", "_")}/') storage = c.get_storage() # loop through rows in worksheet - for row in range(1 + header, gw.count_rows() + 1): + for row in range(1 + c.header, gw.count_rows() + 1): url = gw.get_cell(row, 'url') original_status = gw.get_cell(row, 'status') status = gw.get_cell(row, 'status', fresh=original_status in ['', None] and url != '') - if url != '' and status in ['', None]: - gw.set_cell(row, 'status', 'Archive in progress') + if url == '' or status not in ['', None]: continue - url = expand_url(url) + # All checks done - archival process starts here + gw.set_cell(row, 'status', 'Archive in progress') + url = expand_url(url) + storage.update_properties(subfolder=gw.get_cell_or_default(row, 'subfolder')) - subfolder = gw.get_cell_or_default(row, 'subfolder') + # make a new driver so each spreadsheet row is idempotent + c.recreate_webdriver() - # make a new driver so each spreadsheet row is idempotent - c.recreate_webdriver() + # order matters, first to succeed excludes remaining + active_archivers = [ + TelethonArchiver(storage, c.webdriver, c.telegram_config), + TelegramArchiver(storage, c.webdriver), + TiktokArchiver(storage, c.webdriver), + YoutubeDLArchiver(storage, c.webdriver, c.facebook_cookie), + TwitterArchiver(storage, c.webdriver), + WaybackArchiver(storage, c.webdriver, c.wayback_config) + ] - # order matters, 
first to succeed excludes remaining - active_archivers = [ - TelethonArchiver(storage, c.webdriver, c.telegram_config), - TelegramArchiver(storage, c.webdriver), - TiktokArchiver(storage, c.webdriver), - YoutubeDLArchiver(storage, c.webdriver, c.facebook_cookie), - TwitterArchiver(storage, c.webdriver), - WaybackArchiver(storage, c.webdriver, c.wayback_config) - ] + for archiver in active_archivers: + logger.debug(f'Trying {archiver=} on {row=}') - storage_client = None - if storage == "s3": - storage_client = s3_client - elif storage == "gd": - storage_client = gd_client - else: - raise ValueError(f'Cant get storage_client {storage_client}') - storage_client.update_properties(subfolder=subfolder) - for archiver in active_archivers: - logger.debug(f'Trying {archiver} on row {row}') + try: + result = archiver.download(url, check_if_exists=True) + except KeyboardInterrupt: + # catches keyboard interruptions to do a clean exit + logger.warning(f"caught interrupt for {archiver=} on {row=}") + gw.set_cell(row, 'status', '') + c.destroy_webdriver() + exit() + except Exception as e: + result = False + logger.error(f'Got unexpected error in row {row} with {archiver=} for {url=}: {e}\n{traceback.format_exc()}') - try: - result = archiver.download(url, check_if_exists=True) - except KeyboardInterrupt: - logger.warning("caught interrupt") - gw.set_cell(row, 'status', '') - driver.quit() - exit() - except Exception as e: - result = False - logger.error(f'Got unexpected error in row {row} with archiver {archiver} for url {url}: {e}\n{traceback.format_exc()}') - - if result: - # IA is a Success I believe - or do we want to display a logger warning for it? 
- if result.status in ['success', 'already archived', 'Internet Archive fallback']: - result.status = archiver.name + \ - ": " + str(result.status) - logger.success( - f'{archiver} succeeded on row {row}, url {url}') - if result.status in ['success', 'already archived']: - result.status = f"{archiver.name}: {result.status}" - logger.success(f'{archiver} succeeded on row {row}') - break - logger.warning(f'{archiver} did not succeed on row {row}, final status: {result.status}') - result.status = f"{archiver.name}: {result.status}" - - - # wayback has seen this url before so keep existing status - if "wayback: Internet Archive fallback" in result.status: - logger.success( - f'wayback has seen this url before so keep existing status on row {row}') - result.status = result.status.replace(' (duplicate)', '') - result.status = str(result.status) + " (duplicate)" - break - - logger.warning( - f'{archiver} did not succeed on {row=}, final status: {result.status}') - result.status = archiver.name + \ - ": " + str(result.status) - # get rid of driver so can reload on next row - driver.quit() if result: - update_sheet(gw, row, result) - else: - gw.set_cell(row, 'status', 'failed: no archiver') + result.status = f"{archiver.name}: {result.status}" + if result.status in ['success', 'already archived']: + logger.success(f'{archiver=} succeeded on {row=}, {url=}') + break + logger.warning(f'{archiver} did not succeed on {row=}, final status: {result.status}') + + if result: + update_sheet(gw, row, result) + else: + gw.set_cell(row, 'status', 'failed: no archiver') logger.success(f'Finshed worksheet {wks.title}') @@ -164,26 +124,11 @@ def process_sheet(c: Config, sheet, header=1, columns=GWorksheet.COLUMN_NAMES): def main(): c = Config() c.parse() - logger.info(f'Opening document {c.sheet} for header {c.header}') - parser.add_argument('--storage', action='store', dest='storage', default='s3', help='which storage to use.', choices={"s3", "gd"}) - - for k, v in 
GWorksheet.COLUMN_NAMES.items(): - help = f"the name of the column to fill with {k} (defaults={v})" - if k == "subfolder": - help = f"the name of the column to read the {k} from (defaults={v})" - parser.add_argument(f'--col-{k}', action='store', dest=k, default=v, help=help) - mkdir_if_not_exists(c.tmp_folder) - process_sheet(c, c.sheet, header=c.header, columns=c.column_names) - shutil.rmtree(c.tmp_folder) + process_sheet(c) c.destroy_webdriver() - - logger.info(f'Opening document {args.sheet} for header {args.header}') - - mkdir_if_not_exists('tmp') - process_sheet(args.sheet, header=args.header, columns=config_columns) - shutil.rmtree('tmp') + shutil.rmtree(c.tmp_folder) if __name__ == '__main__': diff --git a/configs/config.py b/configs/config.py index 11ed808..b517f4a 100644 --- a/configs/config.py +++ b/configs/config.py @@ -37,6 +37,14 @@ class Config: def __init__(self): self.parser = self.get_argument_parser() self.folder = "" + self.set_log_files() + + def set_log_files(self): + logger.add("logs/1trace.log", level="TRACE") + logger.add("logs/2info.log", level="INFO") + logger.add("logs/3success.log", level="SUCCESS") + logger.add("logs/4warning.log", level="WARNING") + logger.add("logs/5error.log", level="ERROR") def parse(self): self.args = self.parser.parse_args() @@ -145,7 +153,10 @@ class Config: parser.add_argument('--s3-private', action='store_true', help='Store content without public access permission (only for storage=s3) [secrets.s3.private in config.json]') for k, v in GWorksheet.COLUMN_NAMES.items(): - parser.add_argument(f'--col-{k}', action='store', dest=k, help=f"name of the column to fill with {k} (default='{v}')") + help = f"the name of the column to FILL WITH {k} (default='{v}')" + if k in ["url", "subfolder"]: + help = f"the name of the column to READ {k} FROM (default='{v}')" + parser.add_argument(f'--col-{k}', action='store', dest=k, help=help) return parser diff --git a/utils/misc.py b/utils/misc.py index 5b1a688..2dfd683 100644 --- 
a/utils/misc.py +++ b/utils/misc.py @@ -13,7 +13,8 @@ def expand_url(url): if 'https://t.co/' in url: try: r = requests.get(url) - url = r.url + logger.debug(f'Expanded url {url} to {r.url}') + return r.url except: logger.error(f'Failed to expand url {url}') return url From 66e214afa4baddc5f8454d8f9ab11af829310f55 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Fri, 3 Jun 2022 18:23:45 +0200 Subject: [PATCH 25/84] minor improvements and cleanup archivers --- archivers/base_archiver.py | 30 ++++++++++++++---------------- 1 file changed, 14 insertions(+), 16 deletions(-) diff --git a/archivers/base_archiver.py b/archivers/base_archiver.py index a34589b..43cfa10 100644 --- a/archivers/base_archiver.py +++ b/archivers/base_archiver.py @@ -53,6 +53,9 @@ class Archiver(ABC): # generate the html page eg SM3013/twitter__minmyatnaing13_status_1499415562937503751.html def generate_media_page_html(self, url, urls_info: dict, object, thumbnail=None): + """ + Generates an index.html page where each @urls_info is displayed + """ page = f'''{url}

Archived media from {self.name}

@@ -81,6 +84,10 @@ class Archiver(ABC): # eg images in a tweet save to cloud storage def generate_media_page(self, urls, url, object): + """ + For a list of media urls, fetch them, upload them + and call self.generate_media_page_html with them + """ headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36' } @@ -95,19 +102,12 @@ class Archiver(ABC): filename = Storage.TMP_FOLDER + key - # eg media_url: https://pbs.twimg.com/media/FM7-ggCUYAQHKWW?format=jpg&name=orig d = requests.get(media_url, headers=headers) with open(filename, 'wb') as f: f.write(d.content) - # eg filename: 'tmp/twitter__media_FM7-ggCUYAQHKWW.jpg' - # eg key: 'twitter__media_FM7-ggCUYAQHKWW.jpg' - # or if using filename key: 'SM3013/twitter__media_FM7-ggCUYAQHKWW.jpg' self.storage.upload(filename, key) - hash = self.get_hash(filename) - - # eg 'https://testhashing.fra1.cdn.digitaloceanspaces.com/Test_Hashing/Sheet1/twitter__media_FM7-ggCUYAQHKWW.jpg' cdn_url = self.storage.get_cdn_url(key) if thumbnail is None: @@ -132,14 +132,12 @@ class Archiver(ABC): return f'{self.name}_{_id}{extension}' def get_hash(self, filename): - f = open(filename, "rb") - bytes = f.read() # read entire file as bytes - - # TODO: customizable hash - hash = hashlib.sha256(bytes) - # option to use SHA3_512 instead - # hash = hashlib.sha3_512(bytes) - f.close() + with open(filename, "rb") as f: + bytes = f.read() # read entire file as bytes + # TODO: customizable hash + hash = hashlib.sha256(bytes) + # option to use SHA3_512 instead + # hash = hashlib.sha3_512(bytes) return hash.hexdigest() def get_screenshot(self, url): @@ -155,7 +153,7 @@ class Archiver(ABC): foo = self.driver.find_element(By.XPATH, "//button[@data-cookiebanner='accept_only_essential_button']") foo.click() logger.debug(f'fb click worked') - # linux server needs a sleep otherwise facebook cookie wont have worked and we'll get a popup on next page + # linux server needs a 
sleep otherwise facebook cookie won't have worked and we'll get a popup on next page time.sleep(2) except: logger.warning(f'Failed on fb accept cookies for url {url}') From 5e495b713f38c9a7e9dc8e968b5ec1726c1be659 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Fri, 3 Jun 2022 18:23:53 +0200 Subject: [PATCH 26/84] minor update --- archivers/telethon_archiver.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/archivers/telethon_archiver.py b/archivers/telethon_archiver.py index 294db09..a64c3fd 100644 --- a/archivers/telethon_archiver.py +++ b/archivers/telethon_archiver.py @@ -54,11 +54,11 @@ class TelethonArchiver(Archiver): try: post = self.client.get_messages(chat, ids=post_id) except ValueError as e: - logger.error(f'Could not fetch telegram {url} possibly it\'s private: {e}') + logger.error(f"Could not fetch telegram {url} possibly it's private: {e}") return False except ChannelInvalidError as e: # TODO: check followup here: https://github.com/LonamiWebs/Telethon/issues/3819 - logger.error(f'Could not fetch telegram {url} possibly it\'s private or not displayable in : {e}') + logger.error(f"Could not fetch telegram {url} possibly it's private or not displayable in : {e}") return False media_posts = self._get_media_posts_in_group(chat, post) From e2d1a5d6be557213841efb02778a1a49cf97524b Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Fri, 3 Jun 2022 18:30:12 +0200 Subject: [PATCH 27/84] import cleanups --- archivers/__init__.py | 4 ++-- archivers/base_archiver.py | 15 ++++----------- archivers/telegram_archiver.py | 7 +++---- archivers/telethon_archiver.py | 14 +++++++------- archivers/tiktok_archiver.py | 1 + archivers/wayback_archiver.py | 5 +++-- archivers/youtubedl_archiver.py | 4 ++-- configs/__init__.py | 2 +- configs/config.py | 4 ++-- .../{telegram_config.py => telethon_config.py} | 2 +- 10 files changed, 26 insertions(+), 32 deletions(-) 
rename configs/{telegram_config.py => telethon_config.py} (78%) diff --git a/archivers/__init__.py b/archivers/__init__.py index 26979c0..40fbb4b 100644 --- a/archivers/__init__.py +++ b/archivers/__init__.py @@ -1,8 +1,8 @@ # we need to explicitly expose the available imports here from .base_archiver import Archiver, ArchiveResult from .telegram_archiver import TelegramArchiver -from .telethon_archiver import TelethonArchiver, TelegramConfig +from .telethon_archiver import TelethonArchiver from .tiktok_archiver import TiktokArchiver -from .wayback_archiver import WaybackArchiver, WaybackConfig +from .wayback_archiver import WaybackArchiver from .youtubedl_archiver import YoutubeDLArchiver from .twitter_archiver import TwitterArchiver \ No newline at end of file diff --git a/archivers/base_archiver.py b/archivers/base_archiver.py index 43cfa10..f7d915f 100644 --- a/archivers/base_archiver.py +++ b/archivers/base_archiver.py @@ -1,23 +1,16 @@ -import os -import ffmpeg -import datetime -import shutil +import os, datetime, shutil, hashlib, time, requests from dataclasses import dataclass from abc import ABC, abstractmethod from urllib.parse import urlparse -import hashlib -import time -import requests + +import ffmpeg from loguru import logger from selenium.common.exceptions import TimeoutException +from selenium.webdriver.common.by import By from storages import Storage from utils import mkdir_if_not_exists -from selenium.webdriver.common.by import By -from loguru import logger -from selenium.common.exceptions import TimeoutException - @dataclass class ArchiveResult: diff --git a/archivers/telegram_archiver.py b/archivers/telegram_archiver.py index aafba85..06a8624 100644 --- a/archivers/telegram_archiver.py +++ b/archivers/telegram_archiver.py @@ -1,9 +1,8 @@ -import os -import requests +import os, requests, re + +import html from bs4 import BeautifulSoup from loguru import logger -import re -import html from .base_archiver import Archiver, ArchiveResult from 
storages import Storage diff --git a/archivers/telethon_archiver.py b/archivers/telethon_archiver.py index a64c3fd..09cb47f 100644 --- a/archivers/telethon_archiver.py +++ b/archivers/telethon_archiver.py @@ -1,20 +1,20 @@ -import os -import re +import os, re + import html from loguru import logger +from telethon.sync import TelegramClient +from telethon.errors import ChannelInvalidError from storages import Storage from .base_archiver import Archiver, ArchiveResult -from telethon.sync import TelegramClient -from telethon.errors import ChannelInvalidError -from configs import TelegramConfig +from configs import TelethonConfig class TelethonArchiver(Archiver): name = "telethon" link_pattern = re.compile(r"https:\/\/t\.me(\/c){0,1}\/(.+)\/(.+)") - def __init__(self, storage: Storage, driver, config: TelegramConfig): + def __init__(self, storage: Storage, driver, config: TelethonConfig): super().__init__(storage, driver) self.client = TelegramClient("./anon", config.api_id, config.api_hash) @@ -62,7 +62,7 @@ class TelethonArchiver(Archiver): return False media_posts = self._get_media_posts_in_group(chat, post) - + screenshot = self.get_screenshot(url) if len(media_posts) > 1: diff --git a/archivers/tiktok_archiver.py b/archivers/tiktok_archiver.py index c4534d7..f96ad59 100644 --- a/archivers/tiktok_archiver.py +++ b/archivers/tiktok_archiver.py @@ -5,6 +5,7 @@ from loguru import logger from .base_archiver import Archiver, ArchiveResult from storages import Storage + class TiktokArchiver(Archiver): name = "tiktok" diff --git a/archivers/wayback_archiver.py b/archivers/wayback_archiver.py index c76437e..72448ff 100644 --- a/archivers/wayback_archiver.py +++ b/archivers/wayback_archiver.py @@ -1,11 +1,12 @@ import time, requests + +from loguru import logger from bs4 import BeautifulSoup from storages import Storage from .base_archiver import Archiver, ArchiveResult - from configs import WaybackConfig -from loguru import logger + class WaybackArchiver(Archiver): diff 
--git a/archivers/youtubedl_archiver.py b/archivers/youtubedl_archiver.py index a080925..be3477d 100644 --- a/archivers/youtubedl_archiver.py +++ b/archivers/youtubedl_archiver.py @@ -1,6 +1,6 @@ -import os -import datetime +import os, datetime + import yt_dlp from loguru import logger diff --git a/configs/__init__.py b/configs/__init__.py index d7d3283..4409847 100644 --- a/configs/__init__.py +++ b/configs/__init__.py @@ -1,3 +1,3 @@ from .config import Config from .wayback_config import WaybackConfig -from .telegram_config import TelegramConfig \ No newline at end of file +from .telethon_config import TelethonConfig \ No newline at end of file diff --git a/configs/config.py b/configs/config.py index b517f4a..8e3a6ee 100644 --- a/configs/config.py +++ b/configs/config.py @@ -7,7 +7,7 @@ from dataclasses import dataclass from utils.gworksheet import GWorksheet from .wayback_config import WaybackConfig -from .telegram_config import TelegramConfig +from .telethon_config import TelethonConfig from storages import Storage, S3Config, S3Storage, GDStorage, GDConfig, LocalStorage @@ -131,7 +131,7 @@ class Config: # telethon config if "telegram" in secrets: - self.telegram_config = TelegramConfig( + self.telegram_config = TelethonConfig( api_id=secrets["telegram"]["api_id"], api_hash=secrets["telegram"]["api_hash"] ) diff --git a/configs/telegram_config.py b/configs/telethon_config.py similarity index 78% rename from configs/telegram_config.py rename to configs/telethon_config.py index f5553ad..adf121b 100644 --- a/configs/telegram_config.py +++ b/configs/telethon_config.py @@ -2,6 +2,6 @@ from dataclasses import dataclass @dataclass -class TelegramConfig: +class TelethonConfig: api_id: str api_hash: str \ No newline at end of file From e32c0788b7ede03ca9c43ccaa269f43329eef933 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Fri, 3 Jun 2022 19:40:26 +0200 Subject: [PATCH 28/84] minor update --- configs/config.py | 3 +-- 1 file 
changed, 1 insertion(+), 2 deletions(-) diff --git a/configs/config.py b/configs/config.py index 8e3a6ee..f90eb96 100644 --- a/configs/config.py +++ b/configs/config.py @@ -24,8 +24,7 @@ class Config: Usage: c = Config() # initializes the argument parser c.parse() # parses the values and initializes the Services and API clients - # you can then access the Services and APIs like - c.s3_config + # you can then access the Services and APIs like 'c.s3_config' All the configurations available as cmd line options, when included, will override the configurations in the config.json file. Configurations are split between: From 24544b0fe8c5bb363ffeac7c71ea8e61b5cb2c7d Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Tue, 7 Jun 2022 17:28:47 +0200 Subject: [PATCH 29/84] library updates --- Pipfile | 3 +- Pipfile.lock | 301 +++++++++++++++++++++++++++++---------------------- 2 files changed, 176 insertions(+), 128 deletions(-) diff --git a/Pipfile b/Pipfile index 7e31a3c..9fc7aca 100644 --- a/Pipfile +++ b/Pipfile @@ -8,7 +8,7 @@ gspread = "*" boto3 = "*" argparse = "*" beautifulsoup4 = "*" -tiktok-downloader = {git = "https://github.com/msramalho/tiktok-downloader"} +tiktok-downloader = "*" bs4 = "*" loguru = "*" ffmpeg-python = "*" @@ -20,6 +20,7 @@ google-api-python-client = "*" google-auth-httplib2 = "*" google-auth-oauthlib = "*" oauth2client = "*" +python-slugify = "*" [requires] python_version = "3.9" diff --git a/Pipfile.lock b/Pipfile.lock index 9ee753d..01091af 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -1,7 +1,7 @@ { "_meta": { "hash": { - "sha256": "25b858227d74cc232bba525d34dcf30f15d18d535a6e9c59555e85a0a2bd8c61" + "sha256": "e13fa011edc8726b15cc2a3ef30cd73a71ff33830ca853f6a5e7641f0a9a6f91" }, "pipfile-spec": 6, "requires": { @@ -50,19 +50,19 @@ }, "boto3": { "hashes": [ - "sha256:441b619067cb205bfcd0e66fe085c16989ab65bd348823013e11bef991c00a5c", - 
"sha256:65e45029d234ff94ba8aa3bacb9df00fbbb2f1d9ee7fd1c2e40f4815d12ec3f5" + "sha256:1bc562393d7985263e62828173eea6c7d61562031c646dc857a4f0fad1dfddbe", + "sha256:7625c5ed92bb7a953e03d2541bcbfcb66c3495f8d7b9421e47b4e2c280dc9162" ], "index": "pypi", - "version": "==1.22.9" + "version": "==1.24.3" }, "botocore": { "hashes": [ - "sha256:71962de55b053a0124a0514155f4cdcf0bce81795ffc2bd6e000c1594e99125a", - "sha256:a1d26b95aaa5b2e126df74b223d774fae7e6597bb55c363782178f5b87f0cad3" + "sha256:2d48f4ed77220d4cb6f1b1abbb1b782d1b12260645f6ba3f3cd9ae5c98546297", + "sha256:7be5962b956b5770799ba87b0bd2173230068d269982bdf8d16fabaa79483912" ], - "markers": "python_version >= '3.6'", - "version": "==1.25.9" + "markers": "python_full_version >= '3.7.0'", + "version": "==1.27.3" }, "brotli": { "hashes": [ @@ -141,11 +141,11 @@ }, "cachetools": { "hashes": [ - "sha256:4ebbd38701cdfd3603d1f751d851ed248ab4570929f2d8a7ce69e30c420b141c", - "sha256:8b3b8fa53f564762e5b221e9896798951e7f915513abf2ba072ce0f07f3f5a98" + "sha256:6a94c6402995a99c3970cc7e4884bb60b4a8639938157eeed436098bf9831757", + "sha256:f9f17d2aec496a9aa6b76f53e3b614c965223c061982d434d160f930c698a9db" ], "markers": "python_version ~= '3.7'", - "version": "==5.1.0" + "version": "==5.2.0" }, "certifi": { "hashes": [ @@ -223,7 +223,7 @@ "sha256:7682dc8afb30297001674575ea00d1814d808d6a36af415a82bd481d37ba7b8e", "sha256:bb4d8133cb15a609f44e8213d9b391b0809795062913b383c62be0ee95b1db48" ], - "markers": "python_version >= '3.7'", + "markers": "python_full_version >= '3.7.0'", "version": "==8.1.3" }, "cloudscraper": { @@ -233,6 +233,13 @@ ], "version": "==1.2.60" }, + "commonmark": { + "hashes": [ + "sha256:452f9dc859be7f06631ddcb328b6919c67984aca654e5fefb3914d54691aed60", + "sha256:da2f38c92590f83de410ba1a3cbceafbc74fee9def35f9251ba9a971d6d66fd9" + ], + "version": "==0.9.1" + }, "cryptography": { "hashes": [ "sha256:093cb351031656d3ee2f4fa1be579a8c69c754cf874206be1d4cf3b542042804", @@ -260,14 +267,6 @@ ], "version": "==37.0.2" }, - 
"faker": { - "hashes": [ - "sha256:0301ace8365d98f3d0bf6e9a40200c8548e845d3812402ae1daf589effe3fb01", - "sha256:b1903db92175d78051858128ada397c7dc76f376f6967975419da232b3ebd429" - ], - "markers": "python_version >= '3.6'", - "version": "==13.7.0" - }, "ffmpeg-python": { "hashes": [ "sha256:65225db34627c578ef0e11c8b1eb528bb35e024752f6f10b78c011f6f64c4127", @@ -278,18 +277,18 @@ }, "filelock": { "hashes": [ - "sha256:b795f1b42a61bbf8ec7113c341dad679d772567b936fbd1bf43c9a238e673e20", - "sha256:c7b5fdb219b398a5b28c8e4c1893ef5f98ece6a38c6ab2c22e26ec161556fed6" + "sha256:37def7b658813cda163b56fc564cdc75e86d338246458c4c28ae84cabefa2404", + "sha256:3a0fd85166ad9dbab54c9aec96737b744106dc5f15c0b09a6744a445299fcf04" ], - "markers": "python_version >= '3.7'", - "version": "==3.7.0" + "markers": "python_full_version >= '3.7.0'", + "version": "==3.7.1" }, "flask": { "hashes": [ "sha256:315ded2ddf8a6281567edb27393010fe3406188bafbfe65a3339d5787d89e477", "sha256:fad5b446feb0d6db6aec0c3184d16a8c1f6c3e464b511649c8918a9be100b4fe" ], - "markers": "python_version >= '3.7'", + "markers": "python_full_version >= '3.7.0'", "version": "==2.1.2" }, "future": { @@ -301,19 +300,19 @@ }, "google-api-core": { "hashes": [ - "sha256:065bb8e11c605fd232707ae50963dc1c8af5b3c95b4568887515985e6c1156b3", - "sha256:1b9f59236ce1bae9a687c1d4f22957e79a2669e53d032893f6bf0fca54f6931d" + "sha256:958024c6aa3460b08f35741231076a4dd9a4c819a6a39d44da9627febe8b28f0", + "sha256:ce1daa49644b50398093d2a9ad886501aa845e2602af70c3001b9f402a9d7359" ], "markers": "python_version >= '3.6'", - "version": "==2.8.0" + "version": "==2.8.1" }, "google-api-python-client": { "hashes": [ - "sha256:4527f7b8518a795624ab68da412d55628f83b98c67dd6a5d6edf725454f8b30b", - "sha256:600c43d7eac6e3536fdcad1d14ba9ee503edf4c7db0bd827e791bbf03b9f1330" + "sha256:159aa2d5f67998f39b06f28f38d6621389dda099c56f0fde46e9070dabdd5b40", + "sha256:a45fd3f318f79b3498d31de7e7db16d70b01672a755c88f56841183db908c576" ], "index": "pypi", - "version": "==2.48.0" + 
"version": "==2.50.0" }, "google-auth": { "hashes": [ @@ -341,19 +340,19 @@ }, "googleapis-common-protos": { "hashes": [ - "sha256:6b5ee59dc646eb61a8eb65ee1db186d3df6687c8804830024f32573298bca19b", - "sha256:ddcd955b5bb6589368f659fa475373faa1ed7d09cde5ba25e88513d87007e174" + "sha256:023eaea9d8c1cceccd9587c6af6c20f33eeeb05d4148670f2b0322dc1511700c", + "sha256:b09b56f5463070c2153753ef123f07d2e49235e89148e9b2459ec8ed2f68d7d3" ], "markers": "python_version >= '3.6'", - "version": "==1.56.1" + "version": "==1.56.2" }, "gspread": { "hashes": [ - "sha256:319766d90db05056293f7ee0ad2b35503a1a40683a75897a2922398cd2016283", - "sha256:c719e1c024a2a6f3b7d818fbe07c3886b26fd6504b64d1b1359cf242968213cd" + "sha256:21704b47d007c3b5fd34eddfa4c4a9dcd1ecc1dc615083b9c636127726e66c18", + "sha256:b6172b62fa899e3e4199d2d0ea1008b64305554ba08d3d3a96e9123824fdec48" ], "index": "pypi", - "version": "==5.3.2" + "version": "==5.4.0" }, "h11": { "hashes": [ @@ -381,18 +380,18 @@ }, "importlib-metadata": { "hashes": [ - "sha256:1208431ca90a8cca1a6b8af391bb53c1a2db74e5d1cef6ddced95d4b2062edc6", - "sha256:ea4c597ebf37142f827b8f39299579e31685c31d3a438b59f469406afd0f2539" + "sha256:5d26852efe48c0a32b0509ffbc583fda1a2266545a78d104a6f4aff3db17d700", + "sha256:c58c8eb8a762858f49e18436ff552e83914778e50e9d2f1660535ffb364552ec" ], "markers": "python_version < '3.10'", - "version": "==4.11.3" + "version": "==4.11.4" }, "itsdangerous": { "hashes": [ "sha256:2c2349112351b88699d8d4b6b075022c0808887cb7ad10069318a8b0bc88db44", "sha256:5dbbc68b317e5e42f327f9021763545dc3fc3bfe22e6deb96aaf1fc38874156a" ], - "markers": "python_version >= '3.7'", + "markers": "python_full_version >= '3.7.0'", "version": "==2.1.2" }, "jinja2": { @@ -400,7 +399,7 @@ "sha256:31351a702a408a9e7595a8fc6150fc3f43bb6bf7e319770cbc0db9df9437e852", "sha256:6088930bfe239f0e6710546ab9c19c9ef35e29792895fed6e6e31a023a182a61" ], - "markers": "python_version >= '3.7'", + "markers": "python_full_version >= '3.7.0'", "version": "==3.1.2" }, "jmespath": 
{ @@ -408,7 +407,7 @@ "sha256:a490e280edd1f57d6de88636992d05b71e97d69a26a19f058ecf7d304474bf5e", "sha256:e8dcd576ed616f14ec02eed0005c85973b5890083313860136657e24784e4c04" ], - "markers": "python_version >= '3.7'", + "markers": "python_full_version >= '3.7.0'", "version": "==1.0.0" }, "loguru": { @@ -421,70 +420,72 @@ }, "lxml": { "hashes": [ - "sha256:078306d19a33920004addeb5f4630781aaeabb6a8d01398045fcde085091a169", - "sha256:0c1978ff1fd81ed9dcbba4f91cf09faf1f8082c9d72eb122e92294716c605428", - "sha256:1010042bfcac2b2dc6098260a2ed022968dbdfaf285fc65a3acf8e4eb1ffd1bc", - "sha256:1d650812b52d98679ed6c6b3b55cbb8fe5a5460a0aef29aeb08dc0b44577df85", - "sha256:20b8a746a026017acf07da39fdb10aa80ad9877046c9182442bf80c84a1c4696", - "sha256:2403a6d6fb61c285969b71f4a3527873fe93fd0abe0832d858a17fe68c8fa507", - "sha256:24f5c5ae618395ed871b3d8ebfcbb36e3f1091fd847bf54c4de623f9107942f3", - "sha256:28d1af847786f68bec57961f31221125c29d6f52d9187c01cd34dc14e2b29430", - "sha256:31499847fc5f73ee17dbe1b8e24c6dafc4e8d5b48803d17d22988976b0171f03", - "sha256:31ba2cbc64516dcdd6c24418daa7abff989ddf3ba6d3ea6f6ce6f2ed6e754ec9", - "sha256:330bff92c26d4aee79c5bc4d9967858bdbe73fdbdbacb5daf623a03a914fe05b", - "sha256:5045ee1ccd45a89c4daec1160217d363fcd23811e26734688007c26f28c9e9e7", - "sha256:52cbf2ff155b19dc4d4100f7442f6a697938bf4493f8d3b0c51d45568d5666b5", - "sha256:530f278849031b0eb12f46cca0e5db01cfe5177ab13bd6878c6e739319bae654", - "sha256:545bd39c9481f2e3f2727c78c169425efbfb3fbba6e7db4f46a80ebb249819ca", - "sha256:5804e04feb4e61babf3911c2a974a5b86f66ee227cc5006230b00ac6d285b3a9", - "sha256:5a58d0b12f5053e270510bf12f753a76aaf3d74c453c00942ed7d2c804ca845c", - "sha256:5f148b0c6133fb928503cfcdfdba395010f997aa44bcf6474fcdd0c5398d9b63", - "sha256:5f7d7d9afc7b293147e2d506a4596641d60181a35279ef3aa5778d0d9d9123fe", - "sha256:60d2f60bd5a2a979df28ab309352cdcf8181bda0cca4529769a945f09aba06f9", - "sha256:6259b511b0f2527e6d55ad87acc1c07b3cbffc3d5e050d7e7bcfa151b8202df9", - 
"sha256:6268e27873a3d191849204d00d03f65c0e343b3bcb518a6eaae05677c95621d1", - "sha256:627e79894770783c129cc5e89b947e52aa26e8e0557c7e205368a809da4b7939", - "sha256:62f93eac69ec0f4be98d1b96f4d6b964855b8255c345c17ff12c20b93f247b68", - "sha256:6d6483b1229470e1d8835e52e0ff3c6973b9b97b24cd1c116dca90b57a2cc613", - "sha256:6f7b82934c08e28a2d537d870293236b1000d94d0b4583825ab9649aef7ddf63", - "sha256:6fe4ef4402df0250b75ba876c3795510d782def5c1e63890bde02d622570d39e", - "sha256:719544565c2937c21a6f76d520e6e52b726d132815adb3447ccffbe9f44203c4", - "sha256:730766072fd5dcb219dd2b95c4c49752a54f00157f322bc6d71f7d2a31fecd79", - "sha256:74eb65ec61e3c7c019d7169387d1b6ffcfea1b9ec5894d116a9a903636e4a0b1", - "sha256:7993232bd4044392c47779a3c7e8889fea6883be46281d45a81451acfd704d7e", - "sha256:80bbaddf2baab7e6de4bc47405e34948e694a9efe0861c61cdc23aa774fcb141", - "sha256:86545e351e879d0b72b620db6a3b96346921fa87b3d366d6c074e5a9a0b8dadb", - "sha256:891dc8f522d7059ff0024cd3ae79fd224752676447f9c678f2a5c14b84d9a939", - "sha256:8a31f24e2a0b6317f33aafbb2f0895c0bce772980ae60c2c640d82caac49628a", - "sha256:8b99ec73073b37f9ebe8caf399001848fced9c08064effdbfc4da2b5a8d07b93", - "sha256:986b7a96228c9b4942ec420eff37556c5777bfba6758edcb95421e4a614b57f9", - "sha256:a1547ff4b8a833511eeaceacbcd17b043214fcdb385148f9c1bc5556ca9623e2", - "sha256:a2bfc7e2a0601b475477c954bf167dee6d0f55cb167e3f3e7cefad906e7759f6", - "sha256:a3c5f1a719aa11866ffc530d54ad965063a8cbbecae6515acbd5f0fae8f48eaa", - "sha256:a9f1c3489736ff8e1c7652e9dc39f80cff820f23624f23d9eab6e122ac99b150", - "sha256:aa0cf4922da7a3c905d000b35065df6184c0dc1d866dd3b86fd961905bbad2ea", - "sha256:ad4332a532e2d5acb231a2e5d33f943750091ee435daffca3fec0a53224e7e33", - "sha256:b2582b238e1658c4061ebe1b4df53c435190d22457642377fd0cb30685cdfb76", - "sha256:b6fc2e2fb6f532cf48b5fed57567ef286addcef38c28874458a41b7837a57807", - "sha256:b92d40121dcbd74831b690a75533da703750f7041b4bf951befc657c37e5695a", - "sha256:bbab6faf6568484707acc052f4dfc3802bdb0cafe079383fbaa23f1cdae9ecd4", 
- "sha256:c0b88ed1ae66777a798dc54f627e32d3b81c8009967c63993c450ee4cbcbec15", - "sha256:ce13d6291a5f47c1c8dbd375baa78551053bc6b5e5c0e9bb8e39c0a8359fd52f", - "sha256:db3535733f59e5605a88a706824dfcb9bd06725e709ecb017e165fc1d6e7d429", - "sha256:dd10383f1d6b7edf247d0960a3db274c07e96cf3a3fc7c41c8448f93eac3fb1c", - "sha256:e01f9531ba5420838c801c21c1b0f45dbc9607cb22ea2cf132844453bec863a5", - "sha256:e11527dc23d5ef44d76fef11213215c34f36af1608074561fcc561d983aeb870", - "sha256:e1ab2fac607842ac36864e358c42feb0960ae62c34aa4caaf12ada0a1fb5d99b", - "sha256:e1fd7d2fe11f1cb63d3336d147c852f6d07de0d0020d704c6031b46a30b02ca8", - "sha256:e9f84ed9f4d50b74fbc77298ee5c870f67cb7e91dcdc1a6915cb1ff6a317476c", - "sha256:ec4b4e75fc68da9dc0ed73dcdb431c25c57775383fec325d23a770a64e7ebc87", - "sha256:f10ce66fcdeb3543df51d423ede7e238be98412232fca5daec3e54bcd16b8da0", - "sha256:f63f62fc60e6228a4ca9abae28228f35e1bd3ce675013d1dfb828688d50c6e23", - "sha256:fa56bb08b3dd8eac3a8c5b7d075c94e74f755fd9d8a04543ae8d37b1612dd170", - "sha256:fa9b7c450be85bfc6cd39f6df8c5b8cbd76b5d6fc1f69efec80203f9894b885f" + "sha256:00f3a6f88fd5f4357844dd91a1abac5f466c6799f1b7f1da2df6665253845b11", + "sha256:024684e0c5cfa121c22140d3a0898a3a9b2ea0f0fd2c229b6658af4bdf1155e5", + "sha256:03370ec37fe562238d385e2c53089076dee53aabf8325cab964fdb04a9130fa0", + "sha256:0aa4cce579512c33373ca4c5e23c21e40c1aa1a33533a75e51b654834fd0e4f2", + "sha256:1057356b808d149bc14eb8f37bb89129f237df488661c1e0fc0376ca90e1d2c3", + "sha256:11d62c97ceff9bab94b6b29c010ea5fb6831743459bb759c917f49ba75601cd0", + "sha256:1254a79f8a67a3908de725caf59eae62d86738f6387b0a34b32e02abd6ae73db", + "sha256:1bfb791a8fcdbf55d1d41b8be940393687bec0e9b12733f0796668086d1a23ff", + "sha256:28cf04a1a38e961d4a764d2940af9b941b66263ed5584392ef875ee9c1e360a3", + "sha256:2b9c2341d96926b0d0e132e5c49ef85eb53fa92ae1c3a70f9072f3db0d32bc07", + "sha256:2d10659e6e5c53298e6d718fd126e793285bff904bb71d7239a17218f6a197b7", + 
"sha256:3af00ee88376022589ceeb8170eb67dacf5f7cd625ea59fa0977d719777d4ae8", + "sha256:3cf816aed8125cfc9e6e5c6c31ff94278320d591bd7970c4a0233bee0d1c8790", + "sha256:4becd16750ca5c2a1b1588269322b2cebd10c07738f336c922b658dbab96a61c", + "sha256:4cd69bca464e892ea4ed544ba6a7850aaff6f8d792f8055a10638db60acbac18", + "sha256:4e97c8fc761ad63909198acc892f34c20f37f3baa2c50a62d5ec5d7f1efc68a1", + "sha256:520461c36727268a989790aef08884347cd41f2d8ae855489ccf40b50321d8d7", + "sha256:53b0410b220766321759f7f9066da67b1d0d4a7f6636a477984cbb1d98483955", + "sha256:56e19fb6e4b8bd07fb20028d03d3bc67bcc0621347fbde64f248e44839771756", + "sha256:5a49ad78543925e1a4196e20c9c54492afa4f1502c2a563f73097e2044c75190", + "sha256:5d52e1173f52020392f593f87a6af2d4055dd800574a5cb0af4ea3878801d307", + "sha256:607224ffae9a0cf0a2f6e14f5f6bce43e83a6fbdaa647891729c103bdd6a5593", + "sha256:612ef8f2795a89ba3a1d4c8c1af84d8453fd53ee611aa5ad460fdd2cab426fc2", + "sha256:615886ee84b6f42f1bdf1852a9669b5fe3b96b6ff27f1a7a330b67ad9911200a", + "sha256:63419db39df8dc5564f6f103102c4665f7e4d9cb64030e98cf7a74eae5d5760d", + "sha256:6467626fa74f96f4d80fc6ec2555799e97fff8f36e0bfc7f67769f83e59cff40", + "sha256:65b3b5f12c6fb5611e79157214f3cd533083f9b058bf2fc8a1c5cc5ee40fdc5a", + "sha256:686565ac77ff94a8965c11829af253d9e2ce3bf0d9225b1d2eb5c4d4666d0dca", + "sha256:6af7f51a6010748fc1bb71917318d953c9673e4ae3f6d285aaf93ef5b2eb11c1", + "sha256:70a198030d26f5e569367f0f04509b63256faa76a22886280eea69a4f535dd40", + "sha256:754a1dd04bff8a509a31146bd8f3a5dc8191a8694d582dd5fb71ff09f0722c22", + "sha256:75da29a0752c8f2395df0115ac1681cefbdd4418676015be8178b733704cbff2", + "sha256:81c29c8741fa07ecec8ec7417c3d8d1e2f18cf5a10a280f4e1c3f8c3590228b2", + "sha256:9093a359a86650a3dbd6532c3e4d21a6f58ba2cb60d0e72db0848115d24c10ba", + "sha256:915ecf7d486df17cc65aeefdb680d5ad4390cc8c857cf8db3fe241ed234f856a", + "sha256:94b181dd2777890139e49a5336bf3a9a3378ce66132c665fe8db4e8b7683cde2", + "sha256:94f2e45b054dd759bed137b6e14ae8625495f7d90ddd23cf62c7a68f72b62656", 
+ "sha256:9af19eb789d674b59a9bee5005779757aab857c40bf9cc313cb01eafac55ce55", + "sha256:9cae837b988f44925d14d048fa6a8c54f197c8b1223fd9ee9c27084f84606143", + "sha256:aa7447bf7c1a15ef24e2b86a277b585dd3f055e8890ac7f97374d170187daa97", + "sha256:b1e22f3ee4d75ca261b6bffbf64f6f178cb194b1be3191065a09f8d98828daa9", + "sha256:b5031d151d6147eac53366d6ec87da84cd4d8c5e80b1d9948a667a7164116e39", + "sha256:b62d1431b4c40cda43cc986f19b8c86b1d2ae8918cfc00f4776fdf070b65c0c4", + "sha256:b71c52d69b91af7d18c13aef1b0cc3baee36b78607c711eb14a52bf3aa7c815e", + "sha256:b7679344f2270840dc5babc9ccbedbc04f7473c1f66d4676bb01680c0db85bcc", + "sha256:bb7c1b029e54e26e01b1d1d912fc21abb65650d16ea9a191d026def4ed0859ed", + "sha256:c2a57755e366e0ac7ebdb3e9207f159c3bf1afed02392ab18453ce81f5ee92ee", + "sha256:cf9ec915857d260511399ab87e1e70fa13d6b2972258f8e620a3959468edfc32", + "sha256:d0d03b9636f1326772e6854459728676354d4c7731dae9902b180e2065ba3da6", + "sha256:d1690c4d37674a5f0cdafbc5ed7e360800afcf06928c2a024c779c046891bf09", + "sha256:d76da27f5e3e9bc40eba6ed7a9e985f57547e98cf20521d91215707f2fb57e0f", + "sha256:d882c2f3345261e898b9f604be76b61c901fbfa4ac32e3f51d5dc1edc89da3cb", + "sha256:d8e5021e770b0a3084c30dda5901d5fce6d4474feaf0ced8f8e5a82702502fbb", + "sha256:dd00d28d1ab5fa7627f5abc957f29a6338a7395b724571a8cbff8fbed83aaa82", + "sha256:e35a298691b9e10e5a5631f8f0ba605b30ebe19208dc8f58b670462f53753641", + "sha256:e4d020ecf3740b7312bacab2cb966bb720fd4d3490562d373b4ad91dd1857c0d", + "sha256:e564d5a771b4015f34166a05ea2165b7e283635c41b1347696117f780084b46d", + "sha256:ea3f2e9eb41f973f73619e88bf7bd950b16b4c2ce73d15f24a11800ce1eaf276", + "sha256:eabdbe04ee0a7e760fa6cd9e799d2b020d098c580ba99107d52e1e5e538b1ecb", + "sha256:f17b9df97c5ecdfb56c5e85b3c9df9831246df698f8581c6e111ac664c7c656e", + "sha256:f386def57742aacc3d864169dfce644a8c396f95aa35b41b69df53f558d56dd0", + "sha256:f6d23a01921b741774f35e924d418a43cf03eca1444f3fdfd7978d35a5aaab8b", + 
"sha256:fcdf70191f0d1761d190a436db06a46f05af60e1410e1507935f0332280c9268" ], "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'", - "version": "==4.8.0" + "version": "==4.9.0" }, "markupsafe": { "hashes": [ @@ -529,7 +530,7 @@ "sha256:f121a1420d4e173a5d96e47e9a0c0dcff965afdf1626d28de1460815f7c4ee7a", "sha256:fc7b548b17d238737688817ab67deebb30e8073c95749d55538ed473130ec0c7" ], - "markers": "python_version >= '3.7'", + "markers": "python_full_version >= '3.7.0'", "version": "==2.1.1" }, "mutagen": { @@ -591,7 +592,7 @@ "sha256:e250a42f15bf9d5b09fe1b293bdba2801cd520a9f5ea2d7fb7536d4441811d20", "sha256:ff8d8fa42675249bb456f5db06c00de6c2f4c27a065955917b28c4f15978b9c3" ], - "markers": "python_version >= '3.7'", + "markers": "python_full_version >= '3.7.0'", "version": "==3.20.1" }, "pyaes": { @@ -676,6 +677,14 @@ "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'", "version": "==3.14.1" }, + "pygments": { + "hashes": [ + "sha256:5eb116118f9612ff1ee89ac96437bb6b49e8f04d8a13b514ba26f620208e26eb", + "sha256:dc9c10fb40944260f6ed4c688ece0cd2048414940f1cea51b8b226318411c519" + ], + "markers": "python_version >= '3.6'", + "version": "==2.12.0" + }, "pyopenssl": { "hashes": [ "sha256:660b1b1425aac4a1bea1d94168a85d99f0b3144c869dd4390d27629d0087f1bf", @@ -685,11 +694,11 @@ }, "pyparsing": { "hashes": [ - "sha256:7bf433498c016c4314268d95df76c81b842a4cb2b276fa3312cfb1e1d85f6954", - "sha256:ef7b523f6356f763771559412c0d7134753f037822dad1b16945b7b846f7ad06" + "sha256:2b020ecf7d21b687f219b71ecad3631f644a47f01403fa1d1036b0c6416d70fb", + "sha256:5026bae9a10eeaefb61dab2f09052b9f4307d44aee4eda64b309723d8d206bbc" ], - "markers": "python_full_version >= '3.6.8'", - "version": "==3.0.8" + "markers": "python_version >= '3.1'", + "version": "==3.0.9" }, "pysocks": { "hashes": [ @@ -707,6 +716,14 @@ "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", "version": "==2.8.2" }, + 
"python-slugify": { + "hashes": [ + "sha256:272d106cb31ab99b3496ba085e3fea0e9e76dcde967b5e9992500d1f785ce4e1", + "sha256:7b2c274c308b62f4269a9ba701aa69a797e9bca41aeee5b3a9e79e36b6656927" + ], + "index": "pypi", + "version": "==6.1.2" + }, "requests": { "hashes": [ "sha256:68d7c56fd5a8999887728ef304a6d12edc7be74f1cfa47714fc8b414525c9a61", @@ -730,6 +747,14 @@ ], "version": "==0.9.1" }, + "rich": { + "hashes": [ + "sha256:4c586de507202505346f3e32d1363eb9ed6932f0c2f63184dea88983ff4971e2", + "sha256:d2bbd99c320a2532ac71ff6a3164867884357da3e3301f0240090c5d2fdac7ec" + ], + "markers": "python_version < '4' and python_full_version >= '3.6.3'", + "version": "==12.4.4" + }, "rsa": { "hashes": [ "sha256:5c6bd9dc7a543b7fe4304a631f8a8a3b674e2bbfc49c2ae96200cdbe55df6b17", @@ -740,18 +765,18 @@ }, "s3transfer": { "hashes": [ - "sha256:7a6f4c4d1fdb9a2b640244008e142cbc2cd3ae34b386584ef044dd0f27101971", - "sha256:95c58c194ce657a5f4fb0b9e60a84968c808888aed628cd98ab8771fe1db98ed" + "sha256:06176b74f3a15f61f1b4f25a1fc29a4429040b7647133a463da8fa5bd28d5ecd", + "sha256:2ed07d3866f523cc561bf4a00fc5535827981b117dd7876f036b0c1aca42c947" ], - "markers": "python_version >= '3.6'", - "version": "==0.5.2" + "markers": "python_full_version >= '3.7.0'", + "version": "==0.6.0" }, "selenium": { "hashes": [ - "sha256:866b6dd6c459210662bff922ee7c33162d21920fbf6811e8e5a52be3866a687f" + "sha256:ba5b2633f43cf6fe9d308fa4a6996e00a101ab9cb1aad6fd91ae1f3dbe57f56f" ], "index": "pypi", - "version": "==4.1.5" + "version": "==4.2.0" }, "six": { "hashes": [ @@ -800,16 +825,34 @@ "index": "pypi", "version": "==1.24.0" }, + "text-unidecode": { + "hashes": [ + "sha256:1311f10e8b895935241623731c2ba64f4c455287888b18189350b67134a822e8", + "sha256:bad6603bb14d279193107714b288be206cac565dfa49aa5b105294dd5c4aab93" + ], + "version": "==1.3" + }, "tiktok-downloader": { - "git": "https://github.com/msramalho/tiktok-downloader", - "ref": "7bd8bb331d00ebdc317b8cc9c28ecbd83c89e03c" + "hashes": [ + 
"sha256:48fe204df962893a60360a20b13da133bc22bdbfec87c3cd3a9157f138785242" + ], + "index": "pypi", + "version": "==0.3.3" + }, + "tqdm": { + "hashes": [ + "sha256:40be55d30e200777a307a7585aee69e4eabb46b4ec6a4b4a5f2d9f11e7d5408d", + "sha256:74a2cdefe14d11442cedf3ba4e21a3b84ff9a2dbdc6cfae2c34addb2a14a5ea6" + ], + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", + "version": "==4.64.0" }, "trio": { "hashes": [ "sha256:670a52d3115d0e879e1ac838a4eb999af32f858163e3a704fe4839de2a676070", "sha256:fb2d48e4eab0dfb786a472cd514aaadc71e3445b203bc300bad93daa75d77c1a" ], - "markers": "python_version >= '3.7'", + "markers": "python_full_version >= '3.7.0'", "version": "==0.20.0" }, "trio-websocket": { @@ -829,6 +872,10 @@ "version": "==4.1.1" }, "urllib3": { + "extras": [ + "secure", + "socks" + ], "hashes": [ "sha256:44ece4d53fb1706f667c9bd1c648f5469a2ec925fcf3a776667042d645472c14", "sha256:aabaf16477806a5e1dd19aa41f8c2b7950dd3c746362d7e3223dbe6de6ac448e" @@ -887,7 +934,7 @@ "sha256:fab7c640815812ed5f10fbee7abbf58788d602046b7bb3af9b1ac753a6d5e916", "sha256:fc06cc8073c8e87072138ba1e431300e2d408f054b27047d047b549455066ff4" ], - "markers": "python_version >= '3.7'", + "markers": "python_full_version >= '3.7.0'", "version": "==10.3" }, "werkzeug": { @@ -895,7 +942,7 @@ "sha256:1ce08e8093ed67d638d63879fd1ba3735817f7a80de3674d293f5984f25fb6e6", "sha256:72a4b735692dd3135217911cbeaa1be5fa3f62bffb8745c5215420a03dc55255" ], - "markers": "python_version >= '3.7'", + "markers": "python_full_version >= '3.7.0'", "version": "==2.1.2" }, "wsproto": { @@ -903,23 +950,23 @@ "sha256:2218cb57952d90b9fca325c0dcfb08c3bda93e8fd8070b0a17f048e2e47a521b", "sha256:a2e56bfd5c7cd83c1369d83b5feccd6d37798b74872866e62616e0ecf111bda8" ], - "markers": "python_version >= '3.7'", + "markers": "python_full_version >= '3.7.0'", "version": "==1.1.0" }, "yt-dlp": { "hashes": [ - "sha256:6edefe326b1e1478fdbe627a66203e5248a6b0dd50c101e682cf700ab70cdf72", - 
"sha256:8758d016509d4574b90fbde975aa70adaef71ed5e7a195141588f6d6945205ba" + "sha256:3a7b59d2fb4b39ce8ba8e0b9c5a37fe20e5624f46a2346b4ae66ab1320e35134", + "sha256:deec1009442312c1e2ee5298966842194d0e950b433f0d4fc844ef464b9c32a7" ], "index": "pypi", - "version": "==2022.4.8" + "version": "==2022.5.18" }, "zipp": { "hashes": [ "sha256:56bf8aadb83c24db6c4b577e13de374ccfb67da2078beba1d037c17980bf43ad", "sha256:c4f6e5bbf48e74f7a38e7cc5b0480ff42b0ae5178957d564d18932525d5cf099" ], - "markers": "python_version >= '3.7'", + "markers": "python_full_version >= '3.7.0'", "version": "==3.8.0" } }, From 6f08cef4595500630159df6d07257b3b25e68d5c Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Tue, 7 Jun 2022 18:41:38 +0200 Subject: [PATCH 30/84] example config --- example.config.json | 60 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 60 insertions(+) create mode 100644 example.config.json diff --git a/example.config.json b/example.config.json new file mode 100644 index 0000000..4f49832 --- /dev/null +++ b/example.config.json @@ -0,0 +1,60 @@ +{ + "secrets": { + "s3": {// for storage=s3 + "region": "s3 region like fra1", + "bucket": "s3 bucket name like my-bucket", + "key": "s3 API key", + "secret": "s3 API secret", + "endpoint_url": "use region format like such: https://{region}.digitaloceanspaces.com", + "cdn_url": "use bucket, region, and key (key is the archived file path generated when executing) format like such: https://{bucket}.{region}.cdn.digitaloceanspaces.com/{key}", + "private": false, // if true S3 urls will not be readable online + "key_path": "random" // you can generate a random UUID for the URL instead of a predictable path, useful to still have public but unlisted files + }, + "wayback": { + "key": "your API key, visit https://archive.org/account/s3.php", + "secret": "your API secret" + }, + "telegram": { + "api_id": "your API key, see https://telegra.ph/How-to-get-Telegram-APP-ID--API-HASH-05-27", + 
"api_hash": "your API hash" + }, + "google_sheets": { + "service_account": "normally service_account.json, see https://gspread.readthedocs.io/en/latest/oauth2.html#for-bots-using-service-account" + }, + "google_drive": { // for storage=gd + "service_account": "can be the same or different file from google_sheets defaults to service_account.json", + "root_folder_id": "copy XXXX from https://drive.google.com/drive/folders/XXXX" + }, + "local":{ // for storage=local + "save_to": "local path to save files in ./local_archive" + }, + "facebook": { + "cookie": "optional facebook cookie to have more access to content" + } + }, + "execution": { + "sheet": "your-sheet-name", // can be overwritten with CMD --sheet= + "header": 1, //which row of your tabs contains the header, can be overwritten with CMD --header= + "storage": "s3", // which storage to use, can be overwritten with CMD --storage= + "selenium": { // optional configurations for the selenium browser that takes screenshots, these are the defaults + "timeout_seconds": 120, // values under 10s might mean screenshots fail to grab + "window_width": 1400, + "window_height": 2000 + }, + "tmp_folder": "tmp/", // local tmp folder to save files before uploading to storage + "column_names": { // custom column names, only needed if different from default, can be overwritten with CMD --col-NAME="VALUE" + "url": "link", + "archive": "archive location", + "folder": "folder", + "date": "archive date", + "status": "archive status", + "thumbnail": "thumbnail", + "thumbnail_index": "thumbnail index", + "timestamp": "upload timestamp", + "title": "upload title", + "duration": "duration", + "screenshot": "screenshot", + "hash": "hash" + } + } +} \ No newline at end of file From d46b8e115767d9a135a33970128378d75ba4ae30 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Tue, 7 Jun 2022 18:41:43 +0200 Subject: [PATCH 31/84] README updates --- README.md | 134 
+++++++++++++++++++++++++++++++++++------------------- 1 file changed, 88 insertions(+), 46 deletions(-) diff --git a/README.md b/README.md index 2fbff7d..7b7341d 100644 --- a/README.md +++ b/README.md @@ -6,45 +6,103 @@ This Python script will look for links to Youtube, Twitter, etc,. in a specified If you are using `pipenv` (recommended), `pipenv install` is sufficient to install Python prerequisites. -[A Google Service account is necessary for use with `gspread`.](https://gspread.readthedocs.io/en/latest/oauth2.html#for-bots-using-service-account) Credentials for this account should be stored in `service_account.json`, in the same directory as the script. +You also need: +1. [A Google Service account is necessary for use with `gspread`.](https://gspread.readthedocs.io/en/latest/oauth2.html#for-bots-using-service-account) Credentials for this account should be stored in `service_account.json`, in the same directory as the script. +1. [ffmpeg](https://www.ffmpeg.org/) must also be installed locally for this tool to work. +1. [firefox](https://www.mozilla.org/en-US/firefox/new/) and [geckodriver](https://github.com/mozilla/geckodriver/releases) on a path folder like `/usr/local/bin`. +1. [fonts-noto](https://fonts.google.com/noto) to deal with multiple unicode characters during selenium/geckodriver's screenshots: `sudo apt install fonts-noto -y`. +1. Internet Archive credentials can be retrieved from https://archive.org/account/s3.php. -[ffmpeg](https://www.ffmpeg.org/) must also be installed locally for this tool to work. +### Configuration file +Configuration is done via a config.json file (see [example.config.json](example.config.json)) and some properties of that file can be overwritten via command line arguments. Here is the current result from running the `python auto_archive.py --help`: -[firefox](https://www.mozilla.org/en-US/firefox/new/) and [geckodriver](https://github.com/mozilla/geckodriver/releases) on a path folder like `/usr/local/bin`. +
python auto_archive.py --help -[fonts-noto](https://fonts.google.com/noto) to deal with multiple unicode characters during selenium/geckodriver's screenshots: `sudo apt install fonts-noto -y`. -A `.env` file is required for saving content to a Digital Ocean space and Google Drive, and for archiving pages to the Internet Archive. This file should also be in the script directory, and should contain the following variables: -``` -DO_SPACES_REGION= -DO_BUCKET= -DO_SPACES_KEY= -DO_SPACES_SECRET= -INTERNET_ARCHIVE_S3_KEY= -INTERNET_ARCHIVE_S3_SECRET= -TELEGRAM_API_ID= -TELEGRAM_API_HASH= -FACEBOOK_COOKIE= -GD_ROOT_FOLDER_ID= +```js +usage: auto_archive.py [-h] [--config CONFIG] [--storage {s3,local,gd}] [--sheet SHEET] [--header HEADER] [--s3-private] [--col-url URL] [--col-folder FOLDER] [--col-archive ARCHIVE] [--col-date DATE] [--col-status STATUS] [--col-thumbnail THUMBNAIL] [--col-thumbnail_index THUMBNAIL_INDEX] [--col-timestamp TIMESTAMP] [--col-title TITLE] [--col-duration DURATION] [--col-screenshot SCREENSHOT] [--col-hash HASH] + +Automatically archive social media posts, videos, and images from a Google Sheets document. The command line arguments will always override the configurations in the provided JSON config +file (--config), only some high-level options are allowed via the command line and the JSON configuration file is the preferred method. 
+ +optional arguments: + -h, --help show this help message and exit + --config CONFIG the filename of the JSON configuration file (defaults to 'config.json') + --storage {s3,local,gd} + which storage to use [execution.storage in config.json] + --sheet SHEET the name of the google sheets document [execution.sheet in config.json] + --header HEADER 1-based index for the header row [execution.header in config.json] + --s3-private Store content without public access permission (only for storage=s3) [secrets.s3.private in config.json] + --col-url URL the name of the column to READ url FROM (default='link') + --col-folder FOLDER the name of the column to READ folder FROM (default='destination folder') + --col-archive ARCHIVE + the name of the column to FILL WITH archive (default='archive location') + --col-date DATE the name of the column to FILL WITH date (default='archive date') + --col-status STATUS the name of the column to FILL WITH status (default='archive status') + --col-thumbnail THUMBNAIL + the name of the column to FILL WITH thumbnail (default='thumbnail') + --col-thumbnail_index THUMBNAIL_INDEX + the name of the column to FILL WITH thumbnail_index (default='thumbnail index') + --col-timestamp TIMESTAMP + the name of the column to FILL WITH timestamp (default='upload timestamp') + --col-title TITLE the name of the column to FILL WITH title (default='upload title') + --col-duration DURATION + the name of the column to FILL WITH duration (default='duration') + --col-screenshot SCREENSHOT + the name of the column to FILL WITH screenshot (default='screenshot') + --col-hash HASH the name of the column to FILL WITH hash (default='hash') ``` -`.example.env` is an example of this file +

+ +#### Example invocations +All the configurations can be specified in the JSON config file, but sometimes it is useful to override only some of those like the sheet that we are running the archival on, here are some examples (possibly prepended by `pipenv run`): + +```bash +# all the configurations come from config.json +python auto_archive.py + +# all the configurations come from my_config.json +python auto_archive.py --config my_config.json + +# reads the configurations but saves archived content to google drive instead +python auto_archive.py --config my_config.json --storage gd + +# uses the configurations but for another google docs sheet +# with a header on row 2 and with some different column names +python auto_archive.py --config my_config.json --sheet="use it on another sheets doc" --header=2 --col-url="put urls here" + +# all the configurations come from config.json and specifies that s3 files should be private +python auto_archive.py --s3-private +``` + +### Extra notes on configuration +#### Google Drive +To use Google Drive storage you need the id of the shared folder in the `config.json` file which must be shared with the service account eg `autoarchiverservice@auto-archiver-111111.iam.gserviceaccount.com` and then you can use `--storage=gd` + +#### Telethon (Telegram's API Library) +The first time you run, you will be prompted to authenticate with the associated phone number; alternatively, you can put your `anon.session` in the root. -Internet Archive credentials can be retrieved from https://archive.org/account/s3.php. ## Running - -There is just one necessary command line flag, `--sheet name` which the name of the Google Sheet to check for URLs. This sheet must have been shared with the Google Service account used by `gspread`. This sheet must also have specific columns in the first row: -* `Media URL` (required): the location of the media to be archived. 
This is the only column that should be supplied with data initially +The `--sheet name` property (or `execution.sheet` in the JSON file) is the name of the Google Sheet to check for URLs. +This sheet must have been shared with the Google Service account used by `gspread`. +This sheet must also have specific columns (case-insensitive) in the `header` row (see `COLUMN_NAMES` in [gworksheet.py](utils/gworksheet.py)): +* `Link` (required): the location of the media to be archived. This is the only column that should be supplied with data initially +* `Destination folder`: (optional) by default files are saved to a folder called `name-of-sheets-document/name-of-sheets-tab/`; using this option you can organize documents into folders from the sheet. * `Archive status` (required): the status of the auto archiver script. Any row with text in this column will be skipped automatically. -* `Archive location` (required): the location of the archived version. For files that were not able to be auto archived, this can be manually updated. +* `Archive location`: the location of the archived version. For files that were not able to be auto archived, this can be manually updated. * `Archive date`: the date that the auto archiver script ran for this file * `Upload timestamp`: the timestamp extracted from the video. 
(For YouTube, this unfortunately does not currently include the time) -* `Duration`: the duration of the video * `Upload title`: the "title" of the video from the original source -* `Thumbnail`: an image thumbnail of the video (resize row height to make this more visible) -* `Thumbnail index`: a link to a page that shows many thumbnails for the video, useful for quickly seeing video content +* `Hash`: a hash of the first video or image found +* `Screenshot`: a screenshot taken with from a browser view of opening the page +* in case of videos + * `Duration`: duration in seconds + * `Thumbnail`: an image thumbnail of the video (resize row height to make this more visible) + * `Thumbnail index`: a link to a page that shows many thumbnails for the video, useful for quickly seeing video content + For example, for use with this spreadsheet: @@ -88,11 +146,12 @@ Code is split into functional concepts: ### Current Archivers ```mermaid graph TD - A(Archiver) -->|parent of| B(TelegramArchiver) + A(Archiver) -->|parent of| B(YoutubeDLArchiver) A -->|parent of| C(TikTokArchiver) - A -->|parent of| D(YoutubeDLArchiver) - A -->|parent of| E(WaybackArchiver) - A -->|parent of| F(TwitterArchiver) + A -->|parent of| D(TwitterArchiver) + A -->|parent of| E(TelegramArchiver) + A -->|parent of| F(TelethonArchiver) + A -->|parent of| G(WaybackArchiver) ``` ### Current Storages ```mermaid @@ -102,22 +161,5 @@ graph TD A(BaseStorage) -->|parent of| C(GoogleDriveStorage) ``` -## Saving into Subfolders - -You can have a column in the spreadsheet for the argument `--col-subfolder` that is passed to the storage and can specify a subfolder to put the archived link into. 
- -## Google Drive - -To use Google Drive storage you need the id of the shared folder in the `.env` file which must be shared with the service account eg `autoarchiverservice@auto-archiver-111111.iam.gserviceaccount.com` - -```bash -python auto_archive.py --sheet 'Sheet Name' --storage='gd' -``` - -## Telethon (Telegrams API Library) - -Put your `anon.session` in the root, so that it doesn't stall and ask for authentication - - From f87acb6d1da7dceceededf983ca63abbf7a89b4f Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Tue, 7 Jun 2022 18:41:58 +0200 Subject: [PATCH 32/84] refactor --- .gitignore | 3 +- archivers/base_archiver.py | 4 ++ archivers/tiktok_archiver.py | 7 ++- archivers/wayback_archiver.py | 49 ++++++++-------- auto_archive.py | 40 +++++++------ configs/config.py | 68 +++++++++++++--------- storages/__init__.py | 2 +- storages/base_storage.py | 18 +----- storages/gd_storage.py | 105 +++++++++++++++++----------------- storages/local_storage.py | 21 +++++-- storages/s3_storage.py | 4 +- utils/gworksheet.py | 11 ++-- utils/misc.py | 12 +++- 13 files changed, 187 insertions(+), 157 deletions(-) diff --git a/.gitignore b/.gitignore index d15b3e8..9084d64 100644 --- a/.gitignore +++ b/.gitignore @@ -12,4 +12,5 @@ anu.html anon* config.json config-*.json -logs/* \ No newline at end of file +logs/* +local_archive/ \ No newline at end of file diff --git a/archivers/base_archiver.py b/archivers/base_archiver.py index f7d915f..248d869 100644 --- a/archivers/base_archiver.py +++ b/archivers/base_archiver.py @@ -35,6 +35,9 @@ class Archiver(ABC): def __str__(self): return self.__class__.__name__ + def __repr__(self): + return self.__str__() + @abstractmethod def download(self, url, check_if_exists=False): pass @@ -134,6 +137,7 @@ class Archiver(ABC): return hash.hexdigest() def get_screenshot(self, url): + logger.debug(f"getting screenshot for {url=}") key = self.get_key(urlparse(url).path.replace( "/", "_") + 
datetime.datetime.utcnow().isoformat().replace(" ", "_") + ".png") filename = Storage.TMP_FOLDER + key diff --git a/archivers/tiktok_archiver.py b/archivers/tiktok_archiver.py index f96ad59..c132886 100644 --- a/archivers/tiktok_archiver.py +++ b/archivers/tiktok_archiver.py @@ -18,8 +18,8 @@ class TiktokArchiver(Archiver): try: info = tiktok_downloader.info_post(url) key = self.get_key(f'{info.id}.mp4') - cdn_url = self.storage.get_cdn_url(key) filename = Storage.TMP_FOLDER + key + logger.info(f'found video {key=}') if check_if_exists and self.storage.exists(key): status = 'already archived' @@ -28,13 +28,15 @@ class TiktokArchiver(Archiver): if len(media) <= 0: if status == 'already archived': - return ArchiveResult(status='Could not download media, but already archived', cdn_url=cdn_url) + return ArchiveResult(status='Could not download media, but already archived', cdn_url=self.storage.get_cdn_url(key)) else: return ArchiveResult(status='Could not download media') + logger.info(f'downloading video {key=}') media[0].download(filename) if status != 'already archived': + logger.info(f'uploading video {key=}') self.storage.upload(filename, key) try: @@ -50,6 +52,7 @@ class TiktokArchiver(Archiver): try: os.remove(filename) except FileNotFoundError: logger.info(f'tmp file not found thus not deleted {filename}') + cdn_url = self.storage.get_cdn_url(key) return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb, thumbnail_index=thumb_index, duration=info.duration, title=info.caption, timestamp=info.create.isoformat(), diff --git a/archivers/wayback_archiver.py b/archivers/wayback_archiver.py index 72448ff..7f90c0a 100644 --- a/archivers/wayback_archiver.py +++ b/archivers/wayback_archiver.py @@ -8,26 +8,31 @@ from .base_archiver import Archiver, ArchiveResult from configs import WaybackConfig - class WaybackArchiver(Archiver): name = "wayback" def __init__(self, storage: Storage, driver, config: WaybackConfig): super(WaybackArchiver, 
self).__init__(storage, driver) self.config = config + # TODO: this logic should live at the auto-archiver level self.seen_urls = {} def download(self, url, check_if_exists=False): - if check_if_exists and url in self.seen_urls: - return self.seen_urls[url] + if check_if_exists: + if url in self.seen_urls: return self.seen_urls[url] + logger.debug(f"checking if {url=} already on archive.org") + archive_url = f"https://web.archive.org/web/{url}" + req = requests.get(archive_url) + if req.status_code == 200: + return self.if_archived_return_with_screenshot(url, archive_url, req=req, status='already archived') + + logger.debug(f"POSTing {url=} to web.archive.org") ia_headers = { "Accept": "application/json", "Authorization": f"LOW {self.config.key}:{self.config.secret}" } - - r = requests.post( - 'https://web.archive.org/save/', headers=ia_headers, data={'url': url}) + r = requests.post('https://web.archive.org/save/', headers=ia_headers, data={'url': url}) if r.status_code != 200: logger.warning(f"Internet archive failed with status of {r.status_code}") @@ -38,47 +43,41 @@ class WaybackArchiver(Archiver): return ArchiveResult(status=f"Internet archive failed: {r.json()['message']}") job_id = r.json()['job_id'] - - status_r = requests.get('https://web.archive.org/save/status/' + job_id, headers=ia_headers) - + logger.debug(f"GETting status for {job_id=} on {url=}") + status_r = requests.get(f'https://web.archive.org/save/status/{job_id}', headers=ia_headers) retries = 0 + # TODO: make the job queue parallel -> consider propagation of results back to sheet though # wait 90-120 seconds for the archive job to finish while (status_r.status_code != 200 or status_r.json()['status'] == 'pending') and retries < 30: time.sleep(3) - try: - status_r = requests.get( - 'https://web.archive.org/save/status/' + job_id, headers=ia_headers) + logger.debug(f"GETting status for {job_id=} on {url=} [{retries=}]") + status_r = requests.get(f'https://web.archive.org/save/status/{job_id}', 
headers=ia_headers) except: time.sleep(1) - retries += 1 if status_r.status_code != 200: return ArchiveResult(status="Internet archive failed") status_json = status_r.json() - if status_json['status'] != 'success': return ArchiveResult(status='Internet Archive failed: ' + str(status_json)) - archive_url = 'https://web.archive.org/web/' + \ - status_json['timestamp'] + '/' + status_json['original_url'] + archive_url = f"https://web.archive.org/web/{status_json['timestamp']}/{status_json['original_url']}" + return self.if_archived_return_with_screenshot(archive_url) + def if_archived_return_with_screenshot(self, url, archive_url, req=None, status='success'): try: - r = requests.get(archive_url) - - parsed = BeautifulSoup(r.content, 'html.parser') - + if req is None: + req = requests.get(archive_url) + parsed = BeautifulSoup(req.content, 'html.parser') title = parsed.find_all('title')[0].text - if title == 'Wayback Machine': title = 'Could not get title' except: title = "Could not get title" - screenshot = self.get_screenshot(url) - result = ArchiveResult(status='success', cdn_url=archive_url, title=title, screenshot=screenshot) - self.seen_urls[url] = result - return result + self.seen_urls[url] = ArchiveResult(status=status, cdn_url=archive_url, title=title, screenshot=screenshot) + return self.seen_urls[url] diff --git a/auto_archive.py b/auto_archive.py index a0f8883..75b105d 100644 --- a/auto_archive.py +++ b/auto_archive.py @@ -1,11 +1,12 @@ -import datetime -import shutil -import traceback +import os, datetime, shutil, traceback + from loguru import logger +from slugify import slugify from archivers import TelethonArchiver, TelegramArchiver, TiktokArchiver, YoutubeDLArchiver, TwitterArchiver, WaybackArchiver, ArchiveResult from utils import GWorksheet, mkdir_if_not_exists, expand_url from configs import Config +from storages import Storage def update_sheet(gw, row, result: ArchiveResult): @@ -42,12 +43,12 @@ def update_sheet(gw, row, result: ArchiveResult): def 
missing_required_columns(gw: GWorksheet): - required_found = True + missing = False for required_col in ['url', 'status']: if not gw.col_exists(required_col): - logger.warning(f'Required column for {required_col}: "{gw.columns[required_col]}" not found, skipping worksheet {gw.worksheet.title}') - required_found = False - return required_found + logger.warning(f'Required column for {required_col}: "{gw.columns[required_col]}" not found, skipping worksheet {gw.wks.title}') + missing = True + return missing def process_sheet(c: Config): @@ -60,9 +61,9 @@ def process_sheet(c: Config): if missing_required_columns(gw): continue - # archives will be in a folder 'doc_name/worksheet_name' - # TODO: use slugify lib - c.set_folder(f'{c.sheet.replace(" ", "_")}/{wks.title.replace(" ", "_")}/') + # archives will default to being in a folder 'doc_name/worksheet_name' + default_folder = os.path.join(slugify(c.sheet), slugify(wks.title)) + c.set_folder(default_folder) storage = c.get_storage() # loop through rows in worksheet @@ -76,7 +77,7 @@ def process_sheet(c: Config): # All checks done - archival process starts here gw.set_cell(row, 'status', 'Archive in progress') url = expand_url(url) - storage.update_properties(subfolder=gw.get_cell_or_default(row, 'subfolder')) + c.set_folder(gw.get_cell_or_default(row, 'folder', default_folder, when_empty_use_default=True)) # make a new driver so each spreadsheet row is idempotent c.recreate_webdriver() @@ -92,26 +93,27 @@ def process_sheet(c: Config): ] for archiver in active_archivers: - logger.debug(f'Trying {archiver=} on {row=}') + logger.debug(f'Trying {archiver} on {row=}') try: result = archiver.download(url, check_if_exists=True) except KeyboardInterrupt: # catches keyboard interruptions to do a clean exit - logger.warning(f"caught interrupt for {archiver=} on {row=}") + logger.warning(f"caught interrupt for {archiver} on {row=}") gw.set_cell(row, 'status', '') c.destroy_webdriver() exit() except Exception as e: result = False - 
logger.error(f'Got unexpected error in row {row} with {archiver=} for {url=}: {e}\n{traceback.format_exc()}') + logger.error(f'Got unexpected error in row {row} with {archiver.name} for {url=}: {e}\n{traceback.format_exc()}') if result: + success = result.status in ['success', 'already archived'] result.status = f"{archiver.name}: {result.status}" - if result.status in ['success', 'already archived']: - logger.success(f'{archiver=} succeeded on {row=}, {url=}') + if success: + logger.success(f'{archiver.name} succeeded on {row=}, {url=}') break - logger.warning(f'{archiver} did not succeed on {row=}, final status: {result.status}') + logger.warning(f'{archiver.name} did not succeed on {row=}, final status: {result.status}') if result: update_sheet(gw, row, result) @@ -125,10 +127,10 @@ def main(): c = Config() c.parse() logger.info(f'Opening document {c.sheet} for header {c.header}') - mkdir_if_not_exists(c.tmp_folder) + mkdir_if_not_exists(Storage.TMP_FOLDER) process_sheet(c) c.destroy_webdriver() - shutil.rmtree(c.tmp_folder) + shutil.rmtree(Storage.TMP_FOLDER) if __name__ == '__main__': diff --git a/configs/config.py b/configs/config.py index f90eb96..23c0be1 100644 --- a/configs/config.py +++ b/configs/config.py @@ -3,12 +3,12 @@ import argparse, json import gspread from loguru import logger from selenium import webdriver -from dataclasses import dataclass +from dataclasses import dataclass, asdict -from utils.gworksheet import GWorksheet +from utils import GWorksheet, getattr_or from .wayback_config import WaybackConfig from .telethon_config import TelethonConfig -from storages import Storage, S3Config, S3Storage, GDStorage, GDConfig, LocalStorage +from storages import Storage, S3Config, S3Storage, GDStorage, GDConfig, LocalStorage, LocalConfig @dataclass @@ -39,6 +39,7 @@ class Config: self.set_log_files() def set_log_files(self): + # TODO: isolate to config logger.add("logs/1trace.log", level="TRACE") logger.add("logs/2info.log", level="INFO") 
logger.add("logs/3success.log", level="SUCCESS") @@ -59,21 +60,18 @@ class Config: # ----------------------EXECUTION - execution configurations execution = self.config.get("execution", {}) - self.sheet = getattr(self.args, "sheet", execution.get("sheet")) + self.sheet = getattr_or(self.args, "sheet", execution.get("sheet")) assert self.sheet is not None, "'sheet' must be provided either through command line or configuration file" - self.header = int(getattr(self.args, "header", execution.get("header", 1))) + self.header = int(getattr_or(self.args, "header", execution.get("header", 1))) Storage.TMP_FOLDER = execution.get("tmp_folder", Storage.TMP_FOLDER) - self.storage = getattr(self.args, "storage", execution.get("storage", "s3")) - - for key, name in [("s3", "s3"), ("gd", "google_drive")]: - assert self.storage != key or name in secrets, f"selected storage '{key}' requires secrets.'{name}' in {self.config_file}" + self.storage = getattr_or(self.args, "storage", execution.get("storage", "s3")) # Column names come from config and can be overwritten by CMD # in the end all are considered as lower case config_column_names = execution.get("column_names", {}) self.column_names = {} for k in GWorksheet.COLUMN_NAMES.keys(): - self.column_names[k] = getattr(self.args, k, config_column_names.get(k, GWorksheet.COLUMN_NAMES[k])).lower() + self.column_names[k] = getattr_or(self.args, k, config_column_names.get(k, GWorksheet.COLUMN_NAMES[k])).lower() # selenium driver selenium_configs = execution.get("selenium", {}) @@ -87,6 +85,10 @@ class Config: # ---------------------- SECRETS - APIs and service configurations secrets = self.config.get("secrets", {}) + # assert selected storage credentials exist + for key, name in [("s3", "s3"), ("gd", "google_drive"), ("local", "local")]: + assert self.storage != key or name in secrets, f"selected storage '{key}' requires secrets.'{name}' in {self.config_file}" + # google sheets config self.gsheets_client = gspread.service_account( 
filename=secrets.get("google_sheets", {}).get("service_account", 'service_account.json') @@ -106,8 +108,7 @@ class Config: endpoint_url=s3.get("endpoint_url", S3Config.endpoint_url), cdn_url=s3.get("cdn_url", S3Config.cdn_url), key_path=s3.get("key_path", S3Config.key_path), - private=getattr(self.args, "s3-private", s3.get("private", S3Config.private)), - no_folder=s3.get("no_folder", S3Config.no_folder), + private=getattr_or(self.args, "s3-private", s3.get("private", S3Config.private)) ) # GDrive config @@ -115,8 +116,12 @@ class Config: gd = secrets["google_drive"] self.gd_config = GDConfig( root_folder_id=gd.get("root_folder_id"), - default_folder=gd.get("default_folder", GDConfig.default_folder), - service_account=gd.get("service_account", GDConfig.service_account), + service_account=gd.get("service_account", GDConfig.service_account) + ) + + if "local" in secrets: + self.local_config = LocalConfig( + save_to=secrets["local"].get("save_to", LocalConfig.save_to), ) # wayback machine config @@ -153,30 +158,40 @@ class Config: for k, v in GWorksheet.COLUMN_NAMES.items(): help = f"the name of the column to FILL WITH {k} (default='{v}')" - if k in ["url", "subfolder"]: + if k in ["url", "folder"]: help = f"the name of the column to READ {k} FROM (default='{v}')" parser.add_argument(f'--col-{k}', action='store', dest=k, help=help) return parser def set_folder(self, folder): - # update the folder in each of the storages + """ + update the folder in each of the storages + """ self.folder = folder - if self.s3_config: - self.s3_config.folder = folder - if self.gd_config: - self.gd_config.default_folder = folder + # s3 + if hasattr(self, "s3_config"): self.s3_config.folder = folder + if hasattr(self, "s3_storage"): self.s3_storage.folder = folder + # gdrive + if hasattr(self, "gd_config"): self.gd_config.folder = folder + if hasattr(self, "gd_storage"): self.gd_storage.folder = folder + # local + if hasattr(self, "local_config"): self.local_config.folder = folder + if 
hasattr(self, "local_storage"): self.local_storage.folder = folder def get_storage(self): """ - creates and returns the configured type of storage + returns the configured type of storage, creating if needed """ if self.storage == "s3": - return S3Storage(self.s3_config) + self.s3_storage = getattr_or(self, "s3_storage", S3Storage(self.s3_config)) + return self.s3_storage elif self.storage == "gd": - return GDStorage(self.gd_config) + self.gd_storage = getattr_or(self, "gd_storage", GDStorage(self.gd_config)) + return self.gd_storage elif self.storage == "local": - return LocalStorage(self.folder) + self.local_storage = getattr_or(self, "local_storage", LocalStorage(self.local_config)) + return self.local_storage raise f"storage {self.storage} not implemented, available: {Config.AVAILABLE_STORAGES}" def destroy_webdriver(self): @@ -197,12 +212,13 @@ class Config: return json.dumps({ "config_file": self.config_file, "sheet": self.sheet, + "storage": self.storage, "header": self.header, "tmp_folder": Storage.TMP_FOLDER, - "selenium_config": self.selenium_config, + "selenium_config": asdict(self.selenium_config), "selenium_webdriver": self.webdriver != None, "s3_config": self.s3_config != None, - "s3_private": getattr(self.s3_config, "private", None), + "s3_private": getattr_or(self.s3_config, "private", None), "wayback_config": self.wayback_config != None, "telegram_config": self.telegram_config != None, "gsheets_client": self.gsheets_client != None, diff --git a/storages/__init__.py b/storages/__init__.py index 773c0b3..99f82b3 100644 --- a/storages/__init__.py +++ b/storages/__init__.py @@ -1,5 +1,5 @@ # we need to explicitly expose the available imports here from .base_storage import Storage -from .local_storage import LocalStorage +from .local_storage import LocalStorage, LocalConfig from .s3_storage import S3Config, S3Storage from .gd_storage import GDConfig, GDStorage \ No newline at end of file diff --git a/storages/base_storage.py b/storages/base_storage.py 
index 3d9e361..5c68f82 100644 --- a/storages/base_storage.py +++ b/storages/base_storage.py @@ -23,23 +23,7 @@ class Storage(ABC): with open(filename, 'rb') as f: self.uploadf(f, key, **kwargs) - def update_properties(self, **kwargs): - """ - method used to update general properties that some children may use - and others not, but that all can call - """ - for k, v in kwargs.items(): - if k in self._get_allowed_properties(): - setattr(self, k, v) - else: - logger.warning(f'[{self.__class__.__name__}] does not accept dynamic property "{k}"') - - def _get_allowed_properties(self): - """ - child classes should specify which properties they allow to be set - """ - return set(["subfolder"]) - + #TODO: is this really necessary if only use os.path operations def _clean_path(self, folder, default="", add_forward_slash=True): if folder is None or type(folder) != str or len(folder.strip()) == 0: return default diff --git a/storages/gd_storage.py b/storages/gd_storage.py index f4f2820..f8efc32 100644 --- a/storages/gd_storage.py +++ b/storages/gd_storage.py @@ -1,24 +1,23 @@ +import os, time + from loguru import logger from .base_storage import Storage from dataclasses import dataclass - from googleapiclient.discovery import build from googleapiclient.http import MediaFileUpload from google.oauth2 import service_account -import time - @dataclass class GDConfig: root_folder_id: str - default_folder: str = "default" + folder: str = "default" service_account: str = "service_account.json" class GDStorage(Storage): def __init__(self, config: GDConfig): - self.default_folder = config.default_folder + self.folder = config.folder self.root_folder_id = config.root_folder_id creds = service_account.Credentials.from_service_account_file( config.service_account, scopes=['https://www.googleapis.com/auth/drive']) @@ -29,77 +28,73 @@ class GDStorage(Storage): only support files saved in a folder for GD S3 supports folder and all stored in the root """ - self.subfolder = 
self._clean_path(self.subfolder, self.default_folder, False) - filename = key - logger.debug(f'Looking for {self.subfolder} and filename: {filename} on GD') - - folder_id = self._get_id_from_parent_and_name(self.root_folder_id, self.subfolder, 5, 10) - - # check for sub folder in file youtube_dl_abcde/index.html, needed for thumbnails - # a='youtube_dl_abcde', b='index.html' - a, _, b = filename.partition('/') - if b != '': - logger.debug(f'get_cdn_url: Found a subfolder so need to split on: {a=} and {b=}') - folder_id = self._get_id_from_parent_and_name(folder_id, a, use_mime_type=True) - filename = b + full_name = os.path.join(self.folder, key) + parent_id, folder_id = self.root_folder_id, None + path_parts = full_name.split(os.path.sep) + filename = path_parts[-1] + logger.info(f"looking for folders for {path_parts=} before uploading {filename=}") + for folder in path_parts[0:-1]: + folder_id = self._get_id_from_parent_and_name(parent_id, folder, use_mime_type=True, raise_on_missing=True) + parent_id = folder_id # get id of file inside folder (or sub folder) file_id = self._get_id_from_parent_and_name(folder_id, filename) return f"https://drive.google.com/file/d/{file_id}/view?usp=sharing" - def exists(self, _key): - # TODO: How to check for google drive, as it accepts different names? - return False + def exists(self, key): + try: + self.get_cdn_url(key) + return True + except: return False - def uploadf(self, file, key, **_kwargs): + def uploadf(self, file: str, key: str, **_kwargs): """ - 1. check if subfolder exists or create it - 2. check if key contains sub-subfolder, check if exists or create it - 3. upload file to root_id/subfolder[/sub-subfolder]/filename + 1. for each sub-folder in the path check if exists or create + 2. 
upload file to root_id/other_paths.../filename """ - self.subfolder = self._clean_path(self.subfolder, GDStorage.DEFAULT_UPLOAD_FOLDER_NAME, False) - filename = key - - # get id of subfolder or create if it does not exist - folder_id_to_upload_to = self._get_id_from_parent_and_name(self.root_folder_id, self.subfolder, use_mime_type=True, raise_on_missing=False) - if folder_id_to_upload_to is None: - folder_id_to_upload_to = self._mkdir(self.subfolder, self.root_folder_id) - - # check for sub folder in file youtube_dl_abcde/index.html, needed for thumbnails - # a='youtube_dl_abcde', b='index.html' - a, _, b = filename.partition('/') - if b != '': - logger.debug(f'uploadf: Found a subfolder so need to split on: {a=} and {b=}') - # get id of subfolder or create if it does not exist - sub_folder_id_to_upload_to = self._get_id_from_parent_and_name(folder_id_to_upload_to, a, use_mime_type=True, raise_on_missing=False) - if sub_folder_id_to_upload_to is None: - sub_folder_id_to_upload_to = self._mkdir(a, folder_id_to_upload_to) - - filename = b - folder_id_to_upload_to = sub_folder_id_to_upload_to + full_name = os.path.join(self.folder, key) + parent_id, upload_to = self.root_folder_id, None + path_parts = full_name.split(os.path.sep) + filename = path_parts[-1] + logger.info(f"checking folders {path_parts[0:-1]} exist (or creating) before uploading {filename=}") + for folder in path_parts[0:-1]: + upload_to = self._get_id_from_parent_and_name(parent_id, folder, use_mime_type=True, raise_on_missing=False) + if upload_to is None: + upload_to = self._mkdir(folder, parent_id) + parent_id = upload_to # upload file to gd + logger.debug(f'uploading {filename=} to folder id {upload_to}') file_metadata = { 'name': [filename], - 'parents': [folder_id_to_upload_to] + 'parents': [upload_to] } media = MediaFileUpload(file, resumable=True) gd_file = self.service.files().create(body=file_metadata, media_body=media, fields='id').execute() - logger.debug(f'uploadf: uploaded file 
{gd_file["id"]} succesfully in folder={folder_id_to_upload_to}') + logger.debug(f'uploadf: uploaded file {gd_file["id"]} succesfully in folder={upload_to}') def upload(self, filename: str, key: str, **kwargs): # GD only requires the filename not a file reader - logger.debug(f'[{self.__class__.__name__}] uploading file {filename} with key {key}') self.uploadf(filename, key, **kwargs) - def _get_id_from_parent_and_name(self, parent_id: str, name: str, retries: int = 1, sleep_seconds: int = 10, use_mime_type: bool = False, raise_on_missing: bool = True): + def _get_id_from_parent_and_name(self, parent_id: str, name: str, retries: int = 1, sleep_seconds: int = 10, use_mime_type: bool = False, raise_on_missing: bool = True, use_cache=True): """ Retrieves the id of a folder or file from its @name and the @parent_id folder Optionally does multiple @retries and sleeps @sleep_seconds between them If @use_mime_type will restrict search to "mimeType='application/vnd.google-apps.folder'" If @raise_on_missing will throw error when not found, or returns None + Will remember previous calls to avoid duplication if @use_cache Returns the id of the file or folder from its name as a string """ + # cache logic + if use_cache: + self.api_cache = getattr(self, "api_cache", {}) + cache_key = f"{parent_id}_{name}_{use_mime_type}" + if cache_key in self.api_cache: + logger.debug(f"cache hit for {cache_key=}") + return self.api_cache[cache_key] + + # API logic debug_header: str = f"[searching {name=} in {parent_id=}]" query_string = f"'{parent_id}' in parents and name = '{name}' " if use_mime_type: @@ -115,10 +110,14 @@ class GDStorage(Storage): if len(items) > 0: logger.debug(f"{debug_header} found {len(items)} matches, returning last of {','.join([i['id'] for i in items])}") - return items[-1]['id'] + _id = items[-1]['id'] + if use_cache: self.api_cache[cache_key] = _id + return _id else: - logger.debug(f'{debug_header} not found, attempt {attempt+1}/{retries}. 
sleeping for {sleep_seconds} second(s)') - if attempt < retries - 1: time.sleep(sleep_seconds) + logger.debug(f'{debug_header} not found, attempt {attempt+1}/{retries}.') + if attempt < retries - 1: + logger.debug(f'sleeping for {sleep_seconds} second(s)') + time.sleep(sleep_seconds) if raise_on_missing: raise ValueError(f'{debug_header} not found after {retries} attempt(s)') @@ -129,7 +128,7 @@ class GDStorage(Storage): Creates a new GDrive folder @name inside folder @parent_id Returns id of the created folder """ - logger.debug(f'[_mkdir] Creating new folder with {name=} inside {parent_id=}') + logger.debug(f'Creating new folder with {name=} inside {parent_id=}') file_metadata = { 'name': [name], 'mimeType': 'application/vnd.google-apps.folder', diff --git a/storages/local_storage.py b/storages/local_storage.py index f93446b..cef9a42 100644 --- a/storages/local_storage.py +++ b/storages/local_storage.py @@ -1,13 +1,26 @@ import os -from .base_storage import Storage +from dataclasses import dataclass + +from .base_storage import Storage +from utils import mkdir_if_not_exists + + +@dataclass +class LocalConfig: + folder: str = "" + save_to: str = "./" class LocalStorage(Storage): - def __init__(self, folder): - self.folder = self._clean_path(folder) + def __init__(self, config:LocalConfig): + self.folder = self._clean_path(config.folder) + self.save_to = self._clean_path(config.save_to) + mkdir_if_not_exists(self.save_to) def get_cdn_url(self, key): - return self.folder + self._clean_path(self.subfolder) + key + full_path = os.path.join(self.save_to, self.folder, key) + mkdir_if_not_exists(os.path.join(*full_path.split(os.path.sep)[0:-1])) + return os.path.abspath(full_path) def exists(self, key): return os.path.isfile(self.get_cdn_url(key)) diff --git a/storages/s3_storage.py b/storages/s3_storage.py index 5e882b3..f9922a1 100644 --- a/storages/s3_storage.py +++ b/storages/s3_storage.py @@ -20,8 +20,6 @@ class S3Config: cdn_url: str = 
"https://{bucket}.{region}.cdn.digitaloceanspaces.com/{key}" private: bool = False key_path: str = "default" # 'default' uses full naming, 'random' uses generated uuid - no_folder: bool = False # when true folders are not used for url path - class S3Storage(Storage): @@ -54,7 +52,7 @@ class S3Storage(Storage): ext = os.path.splitext(key)[1] self.key_dict[key] = f"{str(uuid.uuid4())}{ext}" final_key = self.key_dict[key] - return self.folder + self._clean_path(self.subfolder) + final_key + return os.path.join(self.folder, final_key) def get_cdn_url(self, key): return self.cdn_url.format(bucket=self.bucket, region=self.region, key=self._get_path(key)) diff --git a/utils/gworksheet.py b/utils/gworksheet.py index ad0fe44..0e05ab6 100644 --- a/utils/gworksheet.py +++ b/utils/gworksheet.py @@ -10,10 +10,10 @@ class GWorksheet: """ COLUMN_NAMES = { 'url': 'link', - 'subfolder': 'sub folder', + 'status': 'archive status', + 'folder': 'destination folder', 'archive': 'archive location', 'date': 'archive date', - 'status': 'archive status', 'thumbnail': 'thumbnail', 'thumbnail_index': 'thumbnail index', 'timestamp': 'upload timestamp', @@ -72,12 +72,15 @@ class GWorksheet: return '' return row[col_index] - def get_cell_or_default(self, row, col: str, default: str = None, fresh=False): + def get_cell_or_default(self, row, col: str, default: str = None, fresh=False, when_empty_use_default=True): """ return self.get_cell or default value on error (eg: column is missing) """ try: - return self.get_cell(row, col, fresh) + val = self.get_cell(row, col, fresh) + if when_empty_use_default and val.strip() == "": + return default + return val except: return default diff --git a/utils/misc.py b/utils/misc.py index 2dfd683..c49827e 100644 --- a/utils/misc.py +++ b/utils/misc.py @@ -1,11 +1,11 @@ -import os, requests +import os, sys, requests from loguru import logger def mkdir_if_not_exists(folder): if not os.path.exists(folder): - os.mkdir(folder) + os.makedirs(folder) def 
expand_url(url): @@ -18,3 +18,11 @@ def expand_url(url): except: logger.error(f'Failed to expand url {url}') return url + +def getattr_or(o: object, prop: str, default: None = None): + try: + res = getattr(o, prop) + if res is None: raise + return res + except: + return default \ No newline at end of file From 3791afc94ccf860461c06886dc586e0c9d1ecdeb Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Tue, 7 Jun 2022 18:43:04 +0200 Subject: [PATCH 33/84] readme updates --- README.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 7b7341d..cb91cea 100644 --- a/README.md +++ b/README.md @@ -144,6 +144,7 @@ Code is split into functional concepts: 1. [GWorksheet](utils/gworksheet.py) - facilitates some of the reading/writing tasks for a Google Worksheet ### Current Archivers +Archivers are tested in a meaningful order with Wayback Machine being the default, that can easily be changed in the code. ```mermaid graph TD A(Archiver) -->|parent of| B(YoutubeDLArchiver) @@ -157,8 +158,8 @@ graph TD ```mermaid graph TD A(BaseStorage) -->|parent of| B(S3Storage) - C(BaseStorage) -->|parent of| C(LocalStorage) - A(BaseStorage) -->|parent of| C(GoogleDriveStorage) + A(BaseStorage) -->|parent of| C(LocalStorage) + A(BaseStorage) -->|parent of| D(GoogleDriveStorage) ``` From 29a9a55f87e02fe5ff556c53e379e0cf32e3f6a7 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Tue, 7 Jun 2022 18:44:52 +0200 Subject: [PATCH 34/84] config refactor --- .example.env | 18 ------------------ example.config.json | 2 +- 2 files changed, 1 insertion(+), 19 deletions(-) delete mode 100644 .example.env diff --git a/.example.env b/.example.env deleted file mode 100644 index 46d7d7a..0000000 --- a/.example.env +++ /dev/null @@ -1,18 +0,0 @@ -DO_SPACES_REGION= -DO_SPACES_KEY= -DO_SPACES_SECRET= -DO_BUCKET= -INTERNET_ARCHIVE_S3_KEY= -INTERNET_ARCHIVE_S3_SECRET= 
-TELEGRAM_API_ID= -TELEGRAM_API_HASH= - -FACEBOOK_COOKIE=cookie: datr= xxxx - -# Google Drive, Right click on folder, Get link: -# https://drive.google.com/drive/folders/123456789987654321abcdefghijk?usp=sharing -# we want: 123456789987654321abcdefghijk -# Remember to share the folder with the service email -# autoarchiverservice@auto-archiver-333333.iam.gserviceaccount.com -GD_ROOT_FOLDER_ID= - diff --git a/example.config.json b/example.config.json index 4f49832..c633fa7 100644 --- a/example.config.json +++ b/example.config.json @@ -29,7 +29,7 @@ "save_to": "local path to save files in ./local_archive" }, "facebook": { - "cookie": "optional facebook cookie to have more access to content" + "cookie": "optional facebook cookie to have more access to content, from browser, looks like 'cookie: datr= xxxx'" } }, "execution": { From 3019778b8f4e10e0636b9340b481b73ddf7eb864 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Tue, 7 Jun 2022 18:52:19 +0200 Subject: [PATCH 35/84] readme updates --- README.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/README.md b/README.md index cb91cea..d2a4070 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,5 @@ # auto-archiver - -This Python script will look for links to Youtube, Twitter, etc,. in a specified column of a Google Sheet, uses YoutubeDL to download the media, stores the result in a Digital Ocean space or Google Drive, and updates the Google Sheet with the archive location, status, and date. It can be run manually or on an automated basis. +Python script to automatically archive social media posts, videos, and images from a Google Sheets document. Uses different archivers depending on the platform, and can save content to local storage, S3 bucket (Digital Ocean Spaces, AWS, ...), and Google Drive. The Google Sheets where the links come from is updated with information about the archived content. It can be run manually or on an automated basis. 
## Setup From 13e7d0bf1b9f35f1da1949644d920ef2d391f8d0 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Wed, 8 Jun 2022 11:11:09 +0200 Subject: [PATCH 36/84] improving path operations --- archivers/base_archiver.py | 8 ++++---- archivers/telegram_archiver.py | 2 +- archivers/telethon_archiver.py | 5 +++-- archivers/tiktok_archiver.py | 2 +- 4 files changed, 9 insertions(+), 8 deletions(-) diff --git a/archivers/base_archiver.py b/archivers/base_archiver.py index 248d869..d3145e5 100644 --- a/archivers/base_archiver.py +++ b/archivers/base_archiver.py @@ -64,7 +64,7 @@ class Archiver(ABC): page += f"" page_key = self.get_key(urlparse(url).path.replace("/", "_") + ".html") - page_filename = Storage.TMP_FOLDER + page_key + page_filename = os.path.join(Storage.TMP_FOLDER, page_key) page_cdn = self.storage.get_cdn_url(page_key) with open(page_filename, "w") as f: @@ -95,8 +95,8 @@ class Archiver(ABC): key = self.get_key(path.replace("/", "_")) if '.' 
not in path: key += '.jpg' - - filename = Storage.TMP_FOLDER + key + + filename = os.path.join(Storage.TMP_FOLDER, key) d = requests.get(media_url, headers=headers) with open(filename, 'wb') as f: @@ -140,7 +140,7 @@ class Archiver(ABC): logger.debug(f"getting screenshot for {url=}") key = self.get_key(urlparse(url).path.replace( "/", "_") + datetime.datetime.utcnow().isoformat().replace(" ", "_") + ".png") - filename = Storage.TMP_FOLDER + key + filename = os.path.join(Storage.TMP_FOLDER, key) # Accept cookies popup dismiss for ytdlp video if 'facebook.com' in url: diff --git a/archivers/telegram_archiver.py b/archivers/telegram_archiver.py index 06a8624..22de30e 100644 --- a/archivers/telegram_archiver.py +++ b/archivers/telegram_archiver.py @@ -52,7 +52,7 @@ class TelegramArchiver(Archiver): video_id = video_url.split('/')[-1].split('?')[0] key = self.get_key(video_id) - filename = Storage.TMP_FOLDER + key + filename = os.path.join(Storage.TMP_FOLDER, key) cdn_url = self.storage.get_cdn_url(key) if check_if_exists and self.storage.exists(key): diff --git a/archivers/telethon_archiver.py b/archivers/telethon_archiver.py index 09cb47f..74a4ddd 100644 --- a/archivers/telethon_archiver.py +++ b/archivers/telethon_archiver.py @@ -79,7 +79,8 @@ class TelethonArchiver(Archiver): message = post.message for mp in media_posts: if len(mp.message) > len(message): message = mp.message - filename = self.client.download_media(mp.media, f'{Storage.TMP_FOLDER}{chat}_{group_id}/{mp.id}') + filename_dest = os.path.join(Storage.TMP_FOLDER, f'{chat}_{group_id}', mp.id) + filename = self.client.download_media(mp.media, filename_dest) key = filename.split(Storage.TMP_FOLDER)[1] self.storage.upload(filename, key) hash = self.get_hash(filename) @@ -92,7 +93,7 @@ class TelethonArchiver(Archiver): return ArchiveResult(status=status, cdn_url=page_cdn, title=message, timestamp=post.date, hash=page_hash, screenshot=screenshot) elif len(media_posts) == 1: key = 
self.get_key(f'{chat}_{post_id}') - filename = self.client.download_media(post.media, f'{Storage.TMP_FOLDER}{key}') + filename = self.client.download_media(post.media, os.path.join(Storage.TMP_FOLDER,key)) key = filename.split(Storage.TMP_FOLDER)[1].replace(" ", "") self.storage.upload(filename, key) hash = self.get_hash(filename) diff --git a/archivers/tiktok_archiver.py b/archivers/tiktok_archiver.py index c132886..0cc8221 100644 --- a/archivers/tiktok_archiver.py +++ b/archivers/tiktok_archiver.py @@ -18,7 +18,7 @@ class TiktokArchiver(Archiver): try: info = tiktok_downloader.info_post(url) key = self.get_key(f'{info.id}.mp4') - filename = Storage.TMP_FOLDER + key + filename = os.path.join(Storage.TMP_FOLDER, key) logger.info(f'found video {key=}') if check_if_exists and self.storage.exists(key): From 90a6083bc1f90168d1dc8d4effd266fe6ed70121 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Wed, 8 Jun 2022 11:13:12 +0200 Subject: [PATCH 37/84] logs to config --- configs/config.py | 21 +++++++++++---------- example.config.json | 1 + 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/configs/config.py b/configs/config.py index 23c0be1..9943ab9 100644 --- a/configs/config.py +++ b/configs/config.py @@ -36,15 +36,6 @@ class Config: def __init__(self): self.parser = self.get_argument_parser() self.folder = "" - self.set_log_files() - - def set_log_files(self): - # TODO: isolate to config - logger.add("logs/1trace.log", level="TRACE") - logger.add("logs/2info.log", level="INFO") - logger.add("logs/3success.log", level="SUCCESS") - logger.add("logs/4warning.log", level="WARNING") - logger.add("logs/5error.log", level="ERROR") def parse(self): self.args = self.parser.parse_args() @@ -65,6 +56,8 @@ class Config: self.header = int(getattr_or(self.args, "header", execution.get("header", 1))) Storage.TMP_FOLDER = execution.get("tmp_folder", Storage.TMP_FOLDER) self.storage = getattr_or(self.args, "storage", 
execution.get("storage", "s3")) + if getattr_or(self.args, "save_logs", False): + self.set_log_files() # Column names come from config and can be overwritten by CMD # in the end all are considered as lower case @@ -80,7 +73,7 @@ class Config: window_width=int(selenium_configs.get("window_width", SeleniumConfig.window_width)), window_height=int(selenium_configs.get("window_height", SeleniumConfig.window_height)) ) - self.webdriver = "not initalized" + self.webdriver = "not initialized" # ---------------------- SECRETS - APIs and service configurations secrets = self.config.get("secrets", {}) @@ -144,6 +137,14 @@ class Config: del self.config["secrets"] # delete to prevent leaks + def set_log_files(self): + # called only when config.execution.save_logs=true + logger.add("logs/1trace.log", level="TRACE") + logger.add("logs/2info.log", level="INFO") + logger.add("logs/3success.log", level="SUCCESS") + logger.add("logs/4warning.log", level="WARNING") + logger.add("logs/5error.log", level="ERROR") + def get_argument_parser(self): """ Creates the CMD line arguments. 
'python auto_archive.py --help' diff --git a/example.config.json b/example.config.json index c633fa7..0641622 100644 --- a/example.config.json +++ b/example.config.json @@ -42,6 +42,7 @@ "window_height": 2000 }, "tmp_folder": "tmp/", // local tmp folder to save files before uploading to storage + "save_logs": true, // puts execution logs into /logs folder, defaults to false "column_names": { // custom column names, only needed if different from default, can be overwritten with CMD --col-NAME="VALUE" "url": "link", "archive": "archive location", From c622f941d7a6c8b7c085d4ea698e73affb84fb6b Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Wed, 8 Jun 2022 11:44:49 +0200 Subject: [PATCH 38/84] tiktok bug fix --- archivers/tiktok_archiver.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/archivers/tiktok_archiver.py b/archivers/tiktok_archiver.py index 0cc8221..8100bb1 100644 --- a/archivers/tiktok_archiver.py +++ b/archivers/tiktok_archiver.py @@ -53,10 +53,11 @@ class TiktokArchiver(Archiver): except FileNotFoundError: logger.info(f'tmp file not found thus not deleted {filename}') cdn_url = self.storage.get_cdn_url(key) + timestamp = info.create.isoformat() if hasattr(info, "create") else None return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb, - thumbnail_index=thumb_index, duration=info.duration, title=info.caption, timestamp=info.create.isoformat(), - hash=hash, screenshot=screenshot) + thumbnail_index=thumb_index, duration=getattr(info, "duration", 0), title=getattr(info, "caption", ""), + timestamp=timestamp, hash=hash, screenshot=screenshot) except tiktok_downloader.Except.InvalidUrl as e: status = 'Invalid URL' From a0be3c8a2202f5a18e4c041d55f58e5966c8e8e1 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Wed, 8 Jun 2022 11:44:57 +0200 Subject: [PATCH 39/84] todo --- archivers/wayback_archiver.py | 1 + 1 file changed, 1 
insertion(+) diff --git a/archivers/wayback_archiver.py b/archivers/wayback_archiver.py index 7f90c0a..75bf50b 100644 --- a/archivers/wayback_archiver.py +++ b/archivers/wayback_archiver.py @@ -63,6 +63,7 @@ class WaybackArchiver(Archiver): status_json = status_r.json() if status_json['status'] != 'success': + # TODO: if "please try again" in str(status_json).lower() then this can be retried in the future return ArchiveResult(status='Internet Archive failed: ' + str(status_json)) archive_url = f"https://web.archive.org/web/{status_json['timestamp']}/{status_json['original_url']}" From 1df68c3652ce70a253d34a1faa2cc7a23e6de622 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Wed, 8 Jun 2022 11:45:04 +0200 Subject: [PATCH 40/84] config bug fix --- configs/config.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/configs/config.py b/configs/config.py index 9943ab9..9454218 100644 --- a/configs/config.py +++ b/configs/config.py @@ -218,8 +218,10 @@ class Config: "tmp_folder": Storage.TMP_FOLDER, "selenium_config": asdict(self.selenium_config), "selenium_webdriver": self.webdriver != None, - "s3_config": self.s3_config != None, - "s3_private": getattr_or(self.s3_config, "private", None), + "s3_config": hasattr(self, "s3_config"), + "gd_config": hasattr(self, "gd_config"), + "local_config": hasattr(self, "local_config"), + "s3_private": getattr_or(getattr(self, "s3_config", {}), "private", None), "wayback_config": self.wayback_config != None, "telegram_config": self.telegram_config != None, "gsheets_client": self.gsheets_client != None, From f0a276e3a51b6e153ab2564064338abd75ac14bc Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Wed, 8 Jun 2022 11:45:38 +0200 Subject: [PATCH 41/84] bug fix --- storages/s3_storage.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/storages/s3_storage.py b/storages/s3_storage.py index f9922a1..ceb75c1 100644 
--- a/storages/s3_storage.py +++ b/storages/s3_storage.py @@ -26,10 +26,11 @@ class S3Storage(Storage): def __init__(self, config: S3Config): self.bucket = config.bucket self.region = config.region - self.folder = self._clean_path(config.folder) + self.folder = config.folder self.private = config.private self.cdn_url = config.cdn_url self.key_path = config.key_path + self.key_dict = {} self.s3 = boto3.client( 's3', @@ -65,7 +66,6 @@ class S3Storage(Storage): return False def uploadf(self, file, key, **kwargs): - logger.debug(f'[S3 storage] uploading {file=}, {key=}') if self.private: extra_args = kwargs.get("extra_args", {}) else: From 6dcb59fea6a5017a0606dc8e9778e8531f12e9a3 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Wed, 8 Jun 2022 11:46:00 +0200 Subject: [PATCH 42/84] removing unnecessary method --- storages/base_storage.py | 6 ------ storages/local_storage.py | 4 ++-- 2 files changed, 2 insertions(+), 8 deletions(-) diff --git a/storages/base_storage.py b/storages/base_storage.py index 5c68f82..cde00fe 100644 --- a/storages/base_storage.py +++ b/storages/base_storage.py @@ -22,9 +22,3 @@ class Storage(ABC): logger.debug(f'[{self.__class__.__name__}] uploading file {filename} with key {key}') with open(filename, 'rb') as f: self.uploadf(f, key, **kwargs) - - #TODO: is this really necessary if only use os.path operations - def _clean_path(self, folder, default="", add_forward_slash=True): - if folder is None or type(folder) != str or len(folder.strip()) == 0: - return default - return str(Path(folder)) + ("/" if add_forward_slash else "") diff --git a/storages/local_storage.py b/storages/local_storage.py index cef9a42..ca328e0 100644 --- a/storages/local_storage.py +++ b/storages/local_storage.py @@ -13,8 +13,8 @@ class LocalConfig: class LocalStorage(Storage): def __init__(self, config:LocalConfig): - self.folder = self._clean_path(config.folder) - self.save_to = self._clean_path(config.save_to) + self.folder = 
config.folder + self.save_to = config.save_to mkdir_if_not_exists(self.save_to) def get_cdn_url(self, key): From 067e6d89542fede8a7b4f89d86f37732652fcda0 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Wed, 8 Jun 2022 13:39:52 +0200 Subject: [PATCH 43/84] retry mechanism --- archivers/base_archiver.py | 41 +++++++++++++++++++++++++++++++++-- archivers/wayback_archiver.py | 3 ++- auto_archive.py | 16 ++++++++++---- 3 files changed, 53 insertions(+), 7 deletions(-) diff --git a/archivers/base_archiver.py b/archivers/base_archiver.py index d3145e5..4258cca 100644 --- a/archivers/base_archiver.py +++ b/archivers/base_archiver.py @@ -1,7 +1,8 @@ -import os, datetime, shutil, hashlib, time, requests +import os, datetime, shutil, hashlib, time, requests, re from dataclasses import dataclass from abc import ABC, abstractmethod from urllib.parse import urlparse +from random import randrange import ffmpeg from loguru import logger @@ -27,6 +28,7 @@ class ArchiveResult: class Archiver(ABC): name = "default" + retry_regex = r"retrying at (\d+)$" def __init__(self, storage: Storage, driver): self.storage = storage @@ -95,7 +97,7 @@ class Archiver(ABC): key = self.get_key(path.replace("/", "_")) if '.' 
not in path: key += '.jpg' - + filename = os.path.join(Storage.TMP_FOLDER, key) d = requests.get(media_url, headers=headers) @@ -226,3 +228,38 @@ class Archiver(ABC): thumb_index_cdn_url = self.storage.get_cdn_url(thumb_index) return (key_thumb, thumb_index_cdn_url) + + def signal_retry_in(self, min_seconds=1800, max_seconds=7200): + """ + sets state to retry in random between (min_seconds, max_seconds) + """ + now = datetime.datetime.now().timestamp() + retry_at = int(now + randrange(min_seconds, max_seconds)) + logger.debug(f"signaling {retry_at=}") + return ArchiveResult(status=f'retrying at {retry_at}') + + def is_retry(status): + return re.search(Archiver.retry_regex, status) is not None + + def should_retry_from_status(status): + """ + checks status against message in signal_retry_in + returns true if enough time has elapsed, false otherwise + """ + match = re.search(Archiver.retry_regex, status) + if match: + retry_at = int(match.group(1)) + now = datetime.datetime.now().timestamp() + should_retry = now >= retry_at + logger.debug(f"{should_retry=} as {now=} >= {retry_at=}") + return should_retry + return False + + def remove_retry(status): + """ + transforms the status from retry into something else + """ + new_status = re.sub(Archiver.retry_regex, "failed: too many retries", status, 0) + logger.debug(f"removing retry message at {status=}, got {new_status=}") + return new_status + diff --git a/archivers/wayback_archiver.py b/archivers/wayback_archiver.py index 75bf50b..6f04725 100644 --- a/archivers/wayback_archiver.py +++ b/archivers/wayback_archiver.py @@ -63,7 +63,8 @@ class WaybackArchiver(Archiver): status_json = status_r.json() if status_json['status'] != 'success': - # TODO: if "please try again" in str(status_json).lower() then this can be retried in the future + if "please try again" in str(status_json).lower(): + return self.signal_retry_in() return ArchiveResult(status='Internet Archive failed: ' + str(status_json)) archive_url = 
f"https://web.archive.org/web/{status_json['timestamp']}/{status_json['original_url']}" diff --git a/auto_archive.py b/auto_archive.py index 75b105d..a8382b7 100644 --- a/auto_archive.py +++ b/auto_archive.py @@ -1,13 +1,15 @@ -import os, datetime, shutil, traceback +import os, datetime, shutil, traceback, random from loguru import logger from slugify import slugify -from archivers import TelethonArchiver, TelegramArchiver, TiktokArchiver, YoutubeDLArchiver, TwitterArchiver, WaybackArchiver, ArchiveResult +from archivers import TelethonArchiver, TelegramArchiver, TiktokArchiver, YoutubeDLArchiver, TwitterArchiver, WaybackArchiver, ArchiveResult, Archiver from utils import GWorksheet, mkdir_if_not_exists, expand_url from configs import Config from storages import Storage +random.seed() + def update_sheet(gw, row, result: ArchiveResult): cell_updates = [] @@ -72,7 +74,10 @@ def process_sheet(c: Config): original_status = gw.get_cell(row, 'status') status = gw.get_cell(row, 'status', fresh=original_status in ['', None] and url != '') - if url == '' or status not in ['', None]: continue + is_retry = False + if url == '' or status not in ['', None]: + is_retry = Archiver.should_retry_from_status(status) + if not is_retry: continue # All checks done - archival process starts here gw.set_cell(row, 'status', 'Archive in progress') @@ -85,9 +90,9 @@ def process_sheet(c: Config): # order matters, first to succeed excludes remaining active_archivers = [ TelethonArchiver(storage, c.webdriver, c.telegram_config), - TelegramArchiver(storage, c.webdriver), TiktokArchiver(storage, c.webdriver), YoutubeDLArchiver(storage, c.webdriver, c.facebook_cookie), + TelegramArchiver(storage, c.webdriver), TwitterArchiver(storage, c.webdriver), WaybackArchiver(storage, c.webdriver, c.wayback_config) ] @@ -113,6 +118,9 @@ def process_sheet(c: Config): if success: logger.success(f'{archiver.name} succeeded on {row=}, {url=}') break + # only 1 retry possible for now + if is_retry and 
Archiver.is_retry(result.status): + result.status = Archiver.remove_retry(result.status) logger.warning(f'{archiver.name} did not succeed on {row=}, final status: {result.status}') if result: From 562d2f51ad5d8b7407b900543cc9b1d22bce95e0 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Wed, 8 Jun 2022 13:39:57 +0200 Subject: [PATCH 44/84] bot token --- archivers/telethon_archiver.py | 9 +++++---- configs/config.py | 3 ++- configs/telethon_config.py | 3 ++- example.config.json | 3 ++- utils/misc.py | 2 +- 5 files changed, 12 insertions(+), 8 deletions(-) diff --git a/archivers/telethon_archiver.py b/archivers/telethon_archiver.py index 74a4ddd..95bf288 100644 --- a/archivers/telethon_archiver.py +++ b/archivers/telethon_archiver.py @@ -17,6 +17,7 @@ class TelethonArchiver(Archiver): def __init__(self, storage: Storage, driver, config: TelethonConfig): super().__init__(storage, driver) self.client = TelegramClient("./anon", config.api_id, config.api_hash) + self.bot_token = config.bot_token def _get_media_posts_in_group(self, chat, original_post, max_amp=10): """ @@ -45,7 +46,7 @@ class TelethonArchiver(Archiver): status = "success" # app will ask (stall for user input!) for phone number and auth code if anon.session not found - with self.client.start(): + with self.client.start(bot_token=self.bot_token): matches = list(matches[0]) chat, post_id = matches[1], matches[2] @@ -57,11 +58,11 @@ class TelethonArchiver(Archiver): logger.error(f"Could not fetch telegram {url} possibly it's private: {e}") return False except ChannelInvalidError as e: - # TODO: check followup here: https://github.com/LonamiWebs/Telethon/issues/3819 - logger.error(f"Could not fetch telegram {url} possibly it's private or not displayable in : {e}") + logger.error(f"Could not fetch telegram {url}. 
This error can be fixed if you setup a bot_token in addition to api_id and api_hash: {e}") return False media_posts = self._get_media_posts_in_group(chat, post) + logger.debug(f'got {len(media_posts)=} for {url=}') screenshot = self.get_screenshot(url) @@ -93,7 +94,7 @@ class TelethonArchiver(Archiver): return ArchiveResult(status=status, cdn_url=page_cdn, title=message, timestamp=post.date, hash=page_hash, screenshot=screenshot) elif len(media_posts) == 1: key = self.get_key(f'{chat}_{post_id}') - filename = self.client.download_media(post.media, os.path.join(Storage.TMP_FOLDER,key)) + filename = self.client.download_media(post.media, os.path.join(Storage.TMP_FOLDER, key)) key = filename.split(Storage.TMP_FOLDER)[1].replace(" ", "") self.storage.upload(filename, key) hash = self.get_hash(filename) diff --git a/configs/config.py b/configs/config.py index 9454218..8bc6ad0 100644 --- a/configs/config.py +++ b/configs/config.py @@ -130,7 +130,8 @@ class Config: if "telegram" in secrets: self.telegram_config = TelethonConfig( api_id=secrets["telegram"]["api_id"], - api_hash=secrets["telegram"]["api_hash"] + api_hash=secrets["telegram"]["api_hash"], + bot_token=getattr_or(secrets["telegram"], "bot_token") ) else: logger.debug(f"'telegram' key not present in the {self.config_file=}") diff --git a/configs/telethon_config.py b/configs/telethon_config.py index adf121b..2109469 100644 --- a/configs/telethon_config.py +++ b/configs/telethon_config.py @@ -4,4 +4,5 @@ from dataclasses import dataclass @dataclass class TelethonConfig: api_id: str - api_hash: str \ No newline at end of file + api_hash: str + bot_token: str \ No newline at end of file diff --git a/example.config.json b/example.config.json index 0641622..8d8b26e 100644 --- a/example.config.json +++ b/example.config.json @@ -16,7 +16,8 @@ }, "telegram": { "api_id": "your API key, see https://telegra.ph/How-to-get-Telegram-APP-ID--API-HASH-05-27", - "api_hash": "your API hash" + "api_hash": "your API hash", + 
"bot_token": "optional, but allows access to more content such as large videos, talk to @botfather" }, "google_sheets": { "service_account": "normally service_account.json, see https://gspread.readthedocs.io/en/latest/oauth2.html#for-bots-using-service-account" diff --git a/utils/misc.py b/utils/misc.py index c49827e..cd02c21 100644 --- a/utils/misc.py +++ b/utils/misc.py @@ -19,7 +19,7 @@ def expand_url(url): logger.error(f'Failed to expand url {url}') return url -def getattr_or(o: object, prop: str, default: None = None): +def getattr_or(o: object, prop: str, default = None): try: res = getattr(o, prop) if res is None: raise From 9e871b3bbc1ebab730e8168d3016abb73f5c3e41 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Wed, 8 Jun 2022 13:46:20 +0200 Subject: [PATCH 45/84] telethon config fixes --- configs/config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configs/config.py b/configs/config.py index 8bc6ad0..0370020 100644 --- a/configs/config.py +++ b/configs/config.py @@ -131,7 +131,7 @@ class Config: self.telegram_config = TelethonConfig( api_id=secrets["telegram"]["api_id"], api_hash=secrets["telegram"]["api_hash"], - bot_token=getattr_or(secrets["telegram"], "bot_token") + bot_token=secrets["telegram"].get("bot_token", None) ) else: logger.debug(f"'telegram' key not present in the {self.config_file=}") From bd5146ac3e9f8295ebcc01659dc052788e64eb91 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Wed, 8 Jun 2022 18:17:25 +0200 Subject: [PATCH 46/84] bug fixes --- archivers/base_archiver.py | 4 ++-- archivers/telethon_archiver.py | 2 +- archivers/wayback_archiver.py | 18 +++++++++++------- configs/config.py | 2 +- 4 files changed, 15 insertions(+), 11 deletions(-) diff --git a/archivers/base_archiver.py b/archivers/base_archiver.py index 4258cca..b3d872b 100644 --- a/archivers/base_archiver.py +++ b/archivers/base_archiver.py @@ -229,14 +229,14 @@ 
class Archiver(ABC): return (key_thumb, thumb_index_cdn_url) - def signal_retry_in(self, min_seconds=1800, max_seconds=7200): + def signal_retry_in(self, min_seconds=1800, max_seconds=7200, **kwargs): """ sets state to retry in random between (min_seconds, max_seconds) """ now = datetime.datetime.now().timestamp() retry_at = int(now + randrange(min_seconds, max_seconds)) logger.debug(f"signaling {retry_at=}") - return ArchiveResult(status=f'retrying at {retry_at}') + return ArchiveResult(status=f'retrying at {retry_at}', **kwargs) def is_retry(status): return re.search(Archiver.retry_regex, status) is not None diff --git a/archivers/telethon_archiver.py b/archivers/telethon_archiver.py index 95bf288..18996d8 100644 --- a/archivers/telethon_archiver.py +++ b/archivers/telethon_archiver.py @@ -80,7 +80,7 @@ class TelethonArchiver(Archiver): message = post.message for mp in media_posts: if len(mp.message) > len(message): message = mp.message - filename_dest = os.path.join(Storage.TMP_FOLDER, f'{chat}_{group_id}', mp.id) + filename_dest = os.path.join(Storage.TMP_FOLDER, f'{chat}_{group_id}', str(mp.id)) filename = self.client.download_media(mp.media, filename_dest) key = filename.split(Storage.TMP_FOLDER)[1] self.storage.upload(filename, key) diff --git a/archivers/wayback_archiver.py b/archivers/wayback_archiver.py index 6f04725..700194d 100644 --- a/archivers/wayback_archiver.py +++ b/archivers/wayback_archiver.py @@ -27,6 +27,7 @@ class WaybackArchiver(Archiver): if req.status_code == 200: return self.if_archived_return_with_screenshot(url, archive_url, req=req, status='already archived') + screenshot = self.get_screenshot(url) logger.debug(f"POSTing {url=} to web.archive.org") ia_headers = { "Accept": "application/json", @@ -36,11 +37,13 @@ class WaybackArchiver(Archiver): if r.status_code != 200: logger.warning(f"Internet archive failed with status of {r.status_code}") - return ArchiveResult(status="Internet archive failed") + return 
ArchiveResult(status="Internet archive failed", screenshot=screenshot) if 'job_id' not in r.json() and 'message' in r.json(): + if "please try again" in str(r.json()).lower(): + return self.signal_retry_in(screenshot=screenshot) logger.warning(f"Internet archive failed json \n {r.json()}") - return ArchiveResult(status=f"Internet archive failed: {r.json()['message']}") + return ArchiveResult(status=f"Internet archive failed: {r.json()['message']}", screenshot=screenshot) job_id = r.json()['job_id'] logger.debug(f"GETting status for {job_id=} on {url=}") @@ -59,18 +62,19 @@ class WaybackArchiver(Archiver): retries += 1 if status_r.status_code != 200: - return ArchiveResult(status="Internet archive failed") + return ArchiveResult(status="Internet archive failed", screenshot=screenshot) status_json = status_r.json() if status_json['status'] != 'success': + logger.info(f'please try again" in str(status_json).lower(): {("please try again" in str(status_json).lower())}') if "please try again" in str(status_json).lower(): - return self.signal_retry_in() - return ArchiveResult(status='Internet Archive failed: ' + str(status_json)) + return self.signal_retry_in(screenshot=screenshot) + return ArchiveResult(status='Internet Archive failed: ' + str(status_json), screenshot=screenshot) archive_url = f"https://web.archive.org/web/{status_json['timestamp']}/{status_json['original_url']}" return self.if_archived_return_with_screenshot(archive_url) - def if_archived_return_with_screenshot(self, url, archive_url, req=None, status='success'): + def if_archived_return_with_screenshot(self, url, archive_url, screenshot=None, req=None, status='success'): try: if req is None: req = requests.get(archive_url) @@ -80,6 +84,6 @@ class WaybackArchiver(Archiver): title = 'Could not get title' except: title = "Could not get title" - screenshot = self.get_screenshot(url) + screenshot = screenshot or self.get_screenshot(url) self.seen_urls[url] = ArchiveResult(status=status, cdn_url=archive_url, 
title=title, screenshot=screenshot) return self.seen_urls[url] diff --git a/configs/config.py b/configs/config.py index 0370020..70b2046 100644 --- a/configs/config.py +++ b/configs/config.py @@ -220,9 +220,9 @@ class Config: "selenium_config": asdict(self.selenium_config), "selenium_webdriver": self.webdriver != None, "s3_config": hasattr(self, "s3_config"), + "s3_private": getattr_or(getattr(self, "s3_config", {}), "private", None), "gd_config": hasattr(self, "gd_config"), "local_config": hasattr(self, "local_config"), - "s3_private": getattr_or(getattr(self, "s3_config", {}), "private", None), "wayback_config": self.wayback_config != None, "telegram_config": self.telegram_config != None, "gsheets_client": self.gsheets_client != None, From c8a02cb93a0bd93164fbf990a0c0a06a71a12ce6 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Wed, 8 Jun 2022 18:25:58 +0200 Subject: [PATCH 47/84] new wayback error action --- archivers/wayback_archiver.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/archivers/wayback_archiver.py b/archivers/wayback_archiver.py index 700194d..f961799 100644 --- a/archivers/wayback_archiver.py +++ b/archivers/wayback_archiver.py @@ -40,10 +40,7 @@ class WaybackArchiver(Archiver): return ArchiveResult(status="Internet archive failed", screenshot=screenshot) if 'job_id' not in r.json() and 'message' in r.json(): - if "please try again" in str(r.json()).lower(): - return self.signal_retry_in(screenshot=screenshot) - logger.warning(f"Internet archive failed json \n {r.json()}") - return ArchiveResult(status=f"Internet archive failed: {r.json()['message']}", screenshot=screenshot) + return self.custom_retry(r.json(), screenshot=screenshot) job_id = r.json()['job_id'] logger.debug(f"GETting status for {job_id=} on {url=}") @@ -66,10 +63,7 @@ class WaybackArchiver(Archiver): status_json = status_r.json() if status_json['status'] != 'success': - logger.info(f'please try 
again" in str(status_json).lower(): {("please try again" in str(status_json).lower())}') - if "please try again" in str(status_json).lower(): - return self.signal_retry_in(screenshot=screenshot) - return ArchiveResult(status='Internet Archive failed: ' + str(status_json), screenshot=screenshot) + return self.custom_retry(status_json, screenshot=screenshot) archive_url = f"https://web.archive.org/web/{status_json['timestamp']}/{status_json['original_url']}" return self.if_archived_return_with_screenshot(archive_url) @@ -87,3 +81,11 @@ class WaybackArchiver(Archiver): screenshot = screenshot or self.get_screenshot(url) self.seen_urls[url] = ArchiveResult(status=status, cdn_url=archive_url, title=title, screenshot=screenshot) return self.seen_urls[url] + + def custom_retry(self, json_data, **kwargs): + logger.warning(f"Internet archive failed json \n {json_data}") + if "please try again" in str(json_data).lower(): + return self.signal_retry_in(**kwargs) + if "this host has been already captured" in str(json_data).lower(): + return self.signal_retry_in(**kwargs, min_seconds=86400, max_seconds=129600) # 24h to 36h later + return ArchiveResult(status=f"Internet archive failed: {json_data}", **kwargs) From d9b8c48af0a3d86f9721b64faa92497e4237f693 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Tue, 14 Jun 2022 19:19:14 +0200 Subject: [PATCH 48/84] missing parameter bug fix --- archivers/wayback_archiver.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/archivers/wayback_archiver.py b/archivers/wayback_archiver.py index f961799..f75224d 100644 --- a/archivers/wayback_archiver.py +++ b/archivers/wayback_archiver.py @@ -66,7 +66,7 @@ class WaybackArchiver(Archiver): return self.custom_retry(status_json, screenshot=screenshot) archive_url = f"https://web.archive.org/web/{status_json['timestamp']}/{status_json['original_url']}" - return self.if_archived_return_with_screenshot(archive_url) + return 
self.if_archived_return_with_screenshot(url, archive_url) def if_archived_return_with_screenshot(self, url, archive_url, screenshot=None, req=None, status='success'): try: From 06e8781f0f0ba9d8e623529e18f85a3609af5230 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Tue, 14 Jun 2022 19:19:20 +0200 Subject: [PATCH 49/84] clarify it is a filename --- example.config.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/example.config.json b/example.config.json index 8d8b26e..c00865a 100644 --- a/example.config.json +++ b/example.config.json @@ -20,10 +20,10 @@ "bot_token": "optional, but allows access to more content such as large videos, talk to @botfather" }, "google_sheets": { - "service_account": "normally service_account.json, see https://gspread.readthedocs.io/en/latest/oauth2.html#for-bots-using-service-account" + "service_account": "local filename: normally service_account.json, see https://gspread.readthedocs.io/en/latest/oauth2.html#for-bots-using-service-account" }, "google_drive": { // for storage=gd - "service_account": "can be the same or different file from google_sheets defaults to service_account.json", + "service_account": "local filename: can be the same or different file from google_sheets.service_account defaults to service_account.json", "root_folder_id": "copy XXXX from https://drive.google.com/drive/folders/XXXX" }, "local":{ // for storage=local From eca10023b008b8c5300f60fc0cf0bef2999ad041 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Tue, 14 Jun 2022 19:28:34 +0200 Subject: [PATCH 50/84] detecting errors at a higher level to avoid false "in progress" messages --- auto_archive.py | 89 ++++++++++++++++++++++++++----------------------- 1 file changed, 47 insertions(+), 42 deletions(-) diff --git a/auto_archive.py b/auto_archive.py index a8382b7..713928b 100644 --- a/auto_archive.py +++ b/auto_archive.py @@ -80,54 +80,59 @@ def 
process_sheet(c: Config): if not is_retry: continue # All checks done - archival process starts here - gw.set_cell(row, 'status', 'Archive in progress') - url = expand_url(url) - c.set_folder(gw.get_cell_or_default(row, 'folder', default_folder, when_empty_use_default=True)) + try: + gw.set_cell(row, 'status', 'Archive in progress') + url = expand_url(url) + c.set_folder(gw.get_cell_or_default(row, 'folder', default_folder, when_empty_use_default=True)) - # make a new driver so each spreadsheet row is idempotent - c.recreate_webdriver() + # make a new driver so each spreadsheet row is idempotent + c.recreate_webdriver() - # order matters, first to succeed excludes remaining - active_archivers = [ - TelethonArchiver(storage, c.webdriver, c.telegram_config), - TiktokArchiver(storage, c.webdriver), - YoutubeDLArchiver(storage, c.webdriver, c.facebook_cookie), - TelegramArchiver(storage, c.webdriver), - TwitterArchiver(storage, c.webdriver), - WaybackArchiver(storage, c.webdriver, c.wayback_config) - ] + # order matters, first to succeed excludes remaining + active_archivers = [ + TelethonArchiver(storage, c.webdriver, c.telegram_config), + TiktokArchiver(storage, c.webdriver), + YoutubeDLArchiver(storage, c.webdriver, c.facebook_cookie), + TelegramArchiver(storage, c.webdriver), + TwitterArchiver(storage, c.webdriver), + WaybackArchiver(storage, c.webdriver, c.wayback_config) + ] - for archiver in active_archivers: - logger.debug(f'Trying {archiver} on {row=}') + for archiver in active_archivers: + logger.debug(f'Trying {archiver} on {row=}') - try: - result = archiver.download(url, check_if_exists=True) - except KeyboardInterrupt: - # catches keyboard interruptions to do a clean exit - logger.warning(f"caught interrupt for {archiver} on {row=}") - gw.set_cell(row, 'status', '') - c.destroy_webdriver() - exit() - except Exception as e: - result = False - logger.error(f'Got unexpected error in row {row} with {archiver.name} for {url=}: {e}\n{traceback.format_exc()}') + 
try: + result = archiver.download(url, check_if_exists=True) + except KeyboardInterrupt as e: raise e # so the higher level catch can catch it + except Exception as e: + result = False + logger.error(f'Got unexpected error in row {row} with {archiver.name} for {url=}: {e}\n{traceback.format_exc()}') + + if result: + success = result.status in ['success', 'already archived'] + result.status = f"{archiver.name}: {result.status}" + if success: + logger.success(f'{archiver.name} succeeded on {row=}, {url=}') + break + # only 1 retry possible for now + if is_retry and Archiver.is_retry(result.status): + result.status = Archiver.remove_retry(result.status) + logger.warning(f'{archiver.name} did not succeed on {row=}, final status: {result.status}') if result: - success = result.status in ['success', 'already archived'] - result.status = f"{archiver.name}: {result.status}" - if success: - logger.success(f'{archiver.name} succeeded on {row=}, {url=}') - break - # only 1 retry possible for now - if is_retry and Archiver.is_retry(result.status): - result.status = Archiver.remove_retry(result.status) - logger.warning(f'{archiver.name} did not succeed on {row=}, final status: {result.status}') - - if result: - update_sheet(gw, row, result) - else: - gw.set_cell(row, 'status', 'failed: no archiver') - logger.success(f'Finshed worksheet {wks.title}') + update_sheet(gw, row, result) + else: + gw.set_cell(row, 'status', 'failed: no archiver') + except KeyboardInterrupt: + # catches keyboard interruptions to do a clean exit + logger.warning(f"caught interrupt on {row=}, {url=}") + gw.set_cell(row, 'status', '') + c.destroy_webdriver() + exit() + except Exception as e: + logger.error(f'Got unexpected error in row {row} for {url=}: {e}\n{traceback.format_exc()}') + gw.set_cell(row, 'status', 'failed: unexpected error (see logs)') + logger.success(f'Finished worksheet {wks.title}') @logger.catch From c11a20825372a4cfbbfcd0c1af822eebf1b51340 Mon Sep 17 00:00:00 2001 From: msramalho 
<19508417+msramalho@users.noreply.github.com> Date: Tue, 14 Jun 2022 19:54:08 +0200 Subject: [PATCH 51/84] more verbose about mandatory columns --- README.md | 6 +++--- configs/config.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index d2a4070..42e212c 100644 --- a/README.md +++ b/README.md @@ -81,16 +81,16 @@ python auto_archive.py --s3-private To use Google Drive storage you need the id of the shared folder in the `config.json` file which must be shared with the service account eg `autoarchiverservice@auto-archiver-111111.iam.gserviceaccount.com` and then you can use `--storage=gd` #### Telethon (Telegrams API Library) -The first time you run, you will be prompted to do a authentication with the phone number associated, alternativelly you can put your `anon.session` in the root. +The first time you run, you will be prompted to do a authentication with the phone number associated, alternatively you can put your `anon.session` in the root. ## Running The `--sheet name` property (or `execution.sheet` in the JSON file) is the name of the Google Sheet to check for URLs. This sheet must have been shared with the Google Service account used by `gspread`. -This sheet must also have specific columns (case-insensitive) in the `header` row (see `COLUMN_NAMES` in [gworksheet.py](utils/gworksheet.py)): +This sheet must also have specific columns (case-insensitive) in the `header` row (see `COLUMN_NAMES` in [gworksheet.py](utils/gworksheet.py)), only the `link` and `status` columns are mandatory: * `Link` (required): the location of the media to be archived. This is the only column that should be supplied with data initially -* `Destination folder`: (optional) by default files are saved to a folder called `name-of-sheets-document/name-of-sheets-tab/` using this option you can organize documents into folder from the sheet. * `Archive status` (required): the status of the auto archiver script. 
Any row with text in this column will be skipped automatically. +* `Destination folder`: (optional) by default files are saved to a folder called `name-of-sheets-document/name-of-sheets-tab/` using this option you can organize documents into folder from the sheet. * `Archive location`: the location of the archived version. For files that were not able to be auto archived, this can be manually updated. * `Archive date`: the date that the auto archiver script ran for this file * `Upload timestamp`: the timestamp extracted from the video. (For YouTube, this unfortunately does not currently include the time) diff --git a/configs/config.py b/configs/config.py index 70b2046..bfab204 100644 --- a/configs/config.py +++ b/configs/config.py @@ -150,7 +150,7 @@ class Config: """ Creates the CMD line arguments. 'python auto_archive.py --help' """ - parser = argparse.ArgumentParser(description='Automatically archive social media posts, videos, and images from a Google Sheets document. The command line arguments will always override the configurations in the provided JSON config file (--config), only some high-level options are allowed via the command line and the JSON configuration file is the preferred method. ') + parser = argparse.ArgumentParser(description='Automatically archive social media posts, videos, and images from a Google Sheets document. The command line arguments will always override the configurations in the provided JSON config file (--config), only some high-level options are allowed via the command line and the JSON configuration file is the preferred method. The sheet must have the "url" and "status" for the archiver to work. 
') parser.add_argument('--config', action='store', dest='config', help='the filename of the JSON configuration file (defaults to \'config.json\')', default='config.json') parser.add_argument('--storage', action='store', dest='storage', help='which storage to use [execution.storage in config.json]', choices=Config.AVAILABLE_STORAGES) From 22f20ba74435c3641aaef97ade3fad63c875e7a9 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Tue, 14 Jun 2022 19:54:53 +0200 Subject: [PATCH 52/84] improve debugging --- storages/gd_storage.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/storages/gd_storage.py b/storages/gd_storage.py index f8efc32..e772a90 100644 --- a/storages/gd_storage.py +++ b/storages/gd_storage.py @@ -32,7 +32,7 @@ class GDStorage(Storage): parent_id, folder_id = self.root_folder_id, None path_parts = full_name.split(os.path.sep) filename = path_parts[-1] - logger.info(f"looking for folders for {path_parts=} before uploading {filename=}") + logger.info(f"looking for folders for {path_parts[0:-1]} before uploading {filename=}") for folder in path_parts[0:-1]: folder_id = self._get_id_from_parent_and_name(parent_id, folder, use_mime_type=True, raise_on_missing=True) parent_id = folder_id From 6499161f5c1bea9452e43d85330c7d033e2768b4 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Tue, 14 Jun 2022 19:55:05 +0200 Subject: [PATCH 53/84] fixing gd bug on twitter images --- archivers/base_archiver.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/archivers/base_archiver.py b/archivers/base_archiver.py index b3d872b..7e9c85a 100644 --- a/archivers/base_archiver.py +++ b/archivers/base_archiver.py @@ -65,9 +65,9 @@ class Archiver(ABC): page += f"

{self.name} object data:

{object}" page += f"" + #TODO: slugify page_key = self.get_key(urlparse(url).path.replace("/", "_") + ".html") page_filename = os.path.join(Storage.TMP_FOLDER, page_key) - page_cdn = self.storage.get_cdn_url(page_key) with open(page_filename, "w") as f: f.write(page) @@ -94,6 +94,7 @@ class Archiver(ABC): uploaded_media = [] for media_url in urls: path = urlparse(media_url).path + #TODO: slugify key = self.get_key(path.replace("/", "_")) if '.' not in path: key += '.jpg' From 12648bbce90b15af125b9f387be2ac4a98552876 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Tue, 14 Jun 2022 20:15:14 +0200 Subject: [PATCH 54/84] centralizing slugify url method --- archivers/base_archiver.py | 34 +++++++++++++++++++++------------- 1 file changed, 21 insertions(+), 13 deletions(-) diff --git a/archivers/base_archiver.py b/archivers/base_archiver.py index 7e9c85a..18e4c1b 100644 --- a/archivers/base_archiver.py +++ b/archivers/base_archiver.py @@ -8,6 +8,7 @@ import ffmpeg from loguru import logger from selenium.common.exceptions import TimeoutException from selenium.webdriver.common.by import By +from slugify import slugify from storages import Storage from utils import mkdir_if_not_exists @@ -46,9 +47,6 @@ class Archiver(ABC): def get_netloc(self, url): return urlparse(url).netloc - def get_html_key(self, url): - return self.get_key(urlparse(url).path.replace("/", "_") + ".html") - # generate the html page eg SM3013/twitter__minmyatnaing13_status_1499415562937503751.html def generate_media_page_html(self, url, urls_info: dict, object, thumbnail=None): """ @@ -65,8 +63,7 @@ class Archiver(ABC): page += f"

{self.name} object data:

{object}" page += f"" - #TODO: slugify - page_key = self.get_key(urlparse(url).path.replace("/", "_") + ".html") + page_key = self.get_html_key(url) page_filename = os.path.join(Storage.TMP_FOLDER, page_key) with open(page_filename, "w") as f: @@ -93,11 +90,7 @@ class Archiver(ABC): thumbnail = None uploaded_media = [] for media_url in urls: - path = urlparse(media_url).path - #TODO: slugify - key = self.get_key(path.replace("/", "_")) - if '.' not in path: - key += '.jpg' + key = self._get_key_from_url(media_url, ".jpg") filename = os.path.join(Storage.TMP_FOLDER, key) @@ -130,6 +123,23 @@ class Archiver(ABC): return f'{self.name}_{_id}{extension}' + def get_html_key(self, url): + return self._get_key_from_url(url, ".html") + + def _get_key_from_url(self, url, with_extension: str = None, append_datetime: bool = False): + """ + Receives a URL and returns a slugified version of the URL path + if a string is passed in @with_extension the slug is appended with it if there is no "." in the slug + if @append_date is true, the key adds a timestamp after the URL slug and before the extension + """ + slug = slugify(urlparse(url).path) + if append_datetime: + slug += "-" + slugify(datetime.datetime.utcnow().isoformat()) + if with_extension is not None: + if "." 
not in slug: + slug += with_extension + return self.get_key(slug) + def get_hash(self, filename): with open(filename, "rb") as f: bytes = f.read() # read entire file as bytes @@ -141,8 +151,7 @@ class Archiver(ABC): def get_screenshot(self, url): logger.debug(f"getting screenshot for {url=}") - key = self.get_key(urlparse(url).path.replace( - "/", "_") + datetime.datetime.utcnow().isoformat().replace(" ", "_") + ".png") + key = self._get_key_from_url(url, ".png", append_datetime=True) filename = os.path.join(Storage.TMP_FOLDER, key) # Accept cookies popup dismiss for ytdlp video @@ -263,4 +272,3 @@ class Archiver(ABC): new_status = re.sub(Archiver.retry_regex, "failed: too many retries", status, 0) logger.debug(f"removing retry message at {status=}, got {new_status=}") return new_status - From bd753b27ed262dac8dc27edebfa1bdd0ec90929b Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Tue, 14 Jun 2022 20:55:30 +0200 Subject: [PATCH 55/84] numbers in markdown --- README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 42e212c..80ff87f 100644 --- a/README.md +++ b/README.md @@ -7,10 +7,10 @@ If you are using `pipenv` (recommended), `pipenv install` is sufficient to insta You also need: 1. [A Google Service account is necessary for use with `gspread`.](https://gspread.readthedocs.io/en/latest/oauth2.html#for-bots-using-service-account) Credentials for this account should be stored in `service_account.json`, in the same directory as the script. -1. [ffmpeg](https://www.ffmpeg.org/) must also be installed locally for this tool to work. -1. [firefox](https://www.mozilla.org/en-US/firefox/new/) and [geckodriver](https://github.com/mozilla/geckodriver/releases) on a path folder like `/usr/local/bin`. -1. [fonts-noto](https://fonts.google.com/noto) to deal with multiple unicode characters during selenium/geckodriver's screenshots: `sudo apt install fonts-noto -y`. -1. 
Internet Archive credentials can be retrieved from https://archive.org/account/s3.php. +2. [ffmpeg](https://www.ffmpeg.org/) must also be installed locally for this tool to work. +3. [firefox](https://www.mozilla.org/en-US/firefox/new/) and [geckodriver](https://github.com/mozilla/geckodriver/releases) on a path folder like `/usr/local/bin`. +4. [fonts-noto](https://fonts.google.com/noto) to deal with multiple unicode characters during selenium/geckodriver's screenshots: `sudo apt install fonts-noto -y`. +5. Internet Archive credentials can be retrieved from https://archive.org/account/s3.php. ### Configuration file Configuration is done via a config.json file (see [example.config.json](example.config.json)) and some properties of that file can be overwritten via command line arguments. Here is the current result from running the `python auto_archive.py --help`: From 2be539d39ecdf53da5eb6f913e0bdb9eb532aa5c Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Tue, 14 Jun 2022 20:55:43 +0200 Subject: [PATCH 56/84] twitter archiver improvements --- archivers/twitter_archiver.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/archivers/twitter_archiver.py b/archivers/twitter_archiver.py index 04ed578..29c43fe 100644 --- a/archivers/twitter_archiver.py +++ b/archivers/twitter_archiver.py @@ -1,6 +1,8 @@ -from snscrape.modules.twitter import TwitterTweetScraper, Video, Gif, Photo -from loguru import logger + +import html from urllib.parse import urlparse +from loguru import logger +from snscrape.modules.twitter import TwitterTweetScraper, Video, Gif, Photo from .base_archiver import Archiver, ArchiveResult @@ -11,6 +13,7 @@ class TwitterArchiver(Archiver): def download(self, url, check_if_exists=False): if 'twitter.com' != self.get_netloc(url): + logger.debug(f'{url=} is not from twitter') return False tweet_id = urlparse(url).path.split('/') @@ -18,6 +21,7 @@ class TwitterArchiver(Archiver): i 
= tweet_id.index('status') tweet_id = tweet_id[i + 1] else: + logger.debug(f'{url=} does not contain "status"') return False scr = TwitterTweetScraper(tweet_id) @@ -29,8 +33,10 @@ class TwitterArchiver(Archiver): return False if tweet.media is None: - logger.trace(f'No media found') - return False + logger.debug(f'No media found, archiving tweet text only') + screenshot = self.get_screenshot(url) + page_cdn, page_hash, _ = self.generate_media_page_html(url, [], html.escape(tweet.json())) + return ArchiveResult(status="success", cdn_url=page_cdn, title=tweet.content, timestamp=tweet.date, hash=page_hash, screenshot=screenshot) urls = [] @@ -50,4 +56,4 @@ class TwitterArchiver(Archiver): screenshot = self.get_screenshot(url) - return ArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=tweet.date) + return ArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=tweet.date, title=tweet.content) From 64c083b37be6de46cdf4fe296800b6669f66fec4 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Tue, 14 Jun 2022 20:55:59 +0200 Subject: [PATCH 57/84] wayback should re-archive even if old version exists --- archivers/wayback_archiver.py | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/archivers/wayback_archiver.py b/archivers/wayback_archiver.py index f75224d..81c1644 100644 --- a/archivers/wayback_archiver.py +++ b/archivers/wayback_archiver.py @@ -9,6 +9,10 @@ from configs import WaybackConfig class WaybackArchiver(Archiver): + """ + This archiver could implement a check_if_exists by going to "https://web.archive.org/web/{url}" + but that might not be desirable since the webpage might have been archived a long time ago and thus have changed + """ name = "wayback" def __init__(self, storage: Storage, driver, config: WaybackConfig): @@ -21,12 +25,6 @@ class 
WaybackArchiver(Archiver): if check_if_exists: if url in self.seen_urls: return self.seen_urls[url] - logger.debug(f"checking if {url=} already on archive.org") - archive_url = f"https://web.archive.org/web/{url}" - req = requests.get(archive_url) - if req.status_code == 200: - return self.if_archived_return_with_screenshot(url, archive_url, req=req, status='already archived') - screenshot = self.get_screenshot(url) logger.debug(f"POSTing {url=} to web.archive.org") ia_headers = { @@ -66,20 +64,17 @@ class WaybackArchiver(Archiver): return self.custom_retry(status_json, screenshot=screenshot) archive_url = f"https://web.archive.org/web/{status_json['timestamp']}/{status_json['original_url']}" - return self.if_archived_return_with_screenshot(url, archive_url) - def if_archived_return_with_screenshot(self, url, archive_url, screenshot=None, req=None, status='success'): try: - if req is None: - req = requests.get(archive_url) + req = requests.get(archive_url) parsed = BeautifulSoup(req.content, 'html.parser') title = parsed.find_all('title')[0].text if title == 'Wayback Machine': title = 'Could not get title' except: title = "Could not get title" - screenshot = screenshot or self.get_screenshot(url) - self.seen_urls[url] = ArchiveResult(status=status, cdn_url=archive_url, title=title, screenshot=screenshot) + screenshot = self.get_screenshot(url) + self.seen_urls[url] = ArchiveResult(status='success', cdn_url=archive_url, title=title, screenshot=screenshot) return self.seen_urls[url] def custom_retry(self, json_data, **kwargs): From dc60bb15585f08105d4022031f68938ce342bd5d Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Tue, 14 Jun 2022 21:18:18 +0200 Subject: [PATCH 58/84] json -> yaml --- .gitignore | 2 + Pipfile | 1 + Pipfile.lock | 127 +++++++++++++++++++++++++++++--------------- README.md | 20 +++---- configs/config.py | 22 ++++---- example.config.json | 62 --------------------- example.config.yaml | 81 
++++++++++++++++++++++++++++ 7 files changed, 188 insertions(+), 127 deletions(-) delete mode 100644 example.config.json create mode 100644 example.config.yaml diff --git a/.gitignore b/.gitignore index 9084d64..4b7e9ce 100644 --- a/.gitignore +++ b/.gitignore @@ -12,5 +12,7 @@ anu.html anon* config.json config-*.json +config.yaml +config-*.yaml logs/* local_archive/ \ No newline at end of file diff --git a/Pipfile b/Pipfile index 9fc7aca..fedfd51 100644 --- a/Pipfile +++ b/Pipfile @@ -21,6 +21,7 @@ google-auth-httplib2 = "*" google-auth-oauthlib = "*" oauth2client = "*" python-slugify = "*" +pyyaml = "*" [requires] python_version = "3.9" diff --git a/Pipfile.lock b/Pipfile.lock index 01091af..9f1a12b 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -1,7 +1,7 @@ { "_meta": { "hash": { - "sha256": "e13fa011edc8726b15cc2a3ef30cd73a71ff33830ca853f6a5e7641f0a9a6f91" + "sha256": "602a05a8fa475181c24714ab57188a417fdfddf373a7dab4fa0ba0fcb7ce8d0a" }, "pipfile-spec": 6, "requires": { @@ -50,19 +50,19 @@ }, "boto3": { "hashes": [ - "sha256:1bc562393d7985263e62828173eea6c7d61562031c646dc857a4f0fad1dfddbe", - "sha256:7625c5ed92bb7a953e03d2541bcbfcb66c3495f8d7b9421e47b4e2c280dc9162" + "sha256:28ab0947c49a6fb2409004d4a10b2828aec231cb95ca1d800cb1411e191cc201", + "sha256:833e67edfb73f2cc22ff27a1c33728686dc90a9e81ba2551f9462ea2d1b04f41" ], "index": "pypi", - "version": "==1.24.3" + "version": "==1.24.8" }, "botocore": { "hashes": [ - "sha256:2d48f4ed77220d4cb6f1b1abbb1b782d1b12260645f6ba3f3cd9ae5c98546297", - "sha256:7be5962b956b5770799ba87b0bd2173230068d269982bdf8d16fabaa79483912" + "sha256:ad92702930d6cb7b587fc2f619672feb74d5218f8de387a28c2905820db79027", + "sha256:db6667b8dfd175d16187653942cd91dd1f0cf36adc0ea9d7a0805ba4d2a3321f" ], - "markers": "python_full_version >= '3.7.0'", - "version": "==1.27.3" + "markers": "python_version >= '3.7'", + "version": "==1.27.8" }, "brotli": { "hashes": [ @@ -215,7 +215,7 @@ 
"sha256:2857e29ff0d34db842cd7ca3230549d1a697f96ee6d3fb071cfa6c7393832597", "sha256:6881edbebdb17b39b4eaaa821b438bf6eddffb4468cf344f09f89def34a8b1df" ], - "markers": "python_version >= '3'", + "markers": "python_version >= '3.5'", "version": "==2.0.12" }, "click": { @@ -223,7 +223,7 @@ "sha256:7682dc8afb30297001674575ea00d1814d808d6a36af415a82bd481d37ba7b8e", "sha256:bb4d8133cb15a609f44e8213d9b391b0809795062913b383c62be0ee95b1db48" ], - "markers": "python_full_version >= '3.7.0'", + "markers": "python_version >= '3.7'", "version": "==8.1.3" }, "cloudscraper": { @@ -280,7 +280,7 @@ "sha256:37def7b658813cda163b56fc564cdc75e86d338246458c4c28ae84cabefa2404", "sha256:3a0fd85166ad9dbab54c9aec96737b744106dc5f15c0b09a6744a445299fcf04" ], - "markers": "python_full_version >= '3.7.0'", + "markers": "python_version >= '3.7'", "version": "==3.7.1" }, "flask": { @@ -288,7 +288,7 @@ "sha256:315ded2ddf8a6281567edb27393010fe3406188bafbfe65a3339d5787d89e477", "sha256:fad5b446feb0d6db6aec0c3184d16a8c1f6c3e464b511649c8918a9be100b4fe" ], - "markers": "python_full_version >= '3.7.0'", + "markers": "python_version >= '3.7'", "version": "==2.1.2" }, "future": { @@ -308,19 +308,19 @@ }, "google-api-python-client": { "hashes": [ - "sha256:159aa2d5f67998f39b06f28f38d6621389dda099c56f0fde46e9070dabdd5b40", - "sha256:a45fd3f318f79b3498d31de7e7db16d70b01672a755c88f56841183db908c576" + "sha256:a573373041b3f6ccbd04877b70e7425c52daec5b4fe5f440e8f5895c87d1a69c", + "sha256:b444f839bed289ecfe30950ea1cd15b7e7976d8cf9f0a3c778037ae3fb030df3" ], "index": "pypi", - "version": "==2.50.0" + "version": "==2.51.0" }, "google-auth": { "hashes": [ - "sha256:1ba4938e032b73deb51e59c4656a00e0939cf0b1112575099f136babb4563312", - "sha256:349ac49b18b01019453cc99c11c92ed772739778c92f184002b7ab3a5b7ac77d" + "sha256:8a954960f852d5f19e6af14dd8e75c20159609e85d8db37e4013cc8c3824a7e1", + "sha256:df549a1433108801b11bdcc0e312eaf0d5f0500db42f0523e4d65c78722e8475" ], "markers": "python_version >= '2.7' and python_version not in 
'3.0, 3.1, 3.2, 3.3, 3.4, 3.5'", - "version": "==2.6.6" + "version": "==2.7.0" }, "google-auth-httplib2": { "hashes": [ @@ -332,11 +332,11 @@ }, "google-auth-oauthlib": { "hashes": [ - "sha256:24f67735513c4c7134dbde2f1dee5a1deb6acc8dfcb577d7bff30d213a28e7b0", - "sha256:30596b824fc6808fdaca2f048e4998cc40fb4b3599eaea66d28dc7085b36c5b8" + "sha256:6d6161d0ec0a62e2abf2207c6071c117ec5897b300823c4bb2d963ee86e20e4f", + "sha256:d5e98a71203330699f92a26bc08847a92e8c3b1b8d82a021f1af34164db143ae" ], "index": "pypi", - "version": "==0.5.1" + "version": "==0.5.2" }, "googleapis-common-protos": { "hashes": [ @@ -375,7 +375,7 @@ "sha256:84d9dd047ffa80596e0f246e2eab0b391788b0503584e8945f2368256d2735ff", "sha256:9d643ff0a55b762d5cdb124b8eaa99c66322e2157b69160bc32796e824360e6d" ], - "markers": "python_version >= '3'", + "markers": "python_version >= '3.5'", "version": "==3.3" }, "importlib-metadata": { @@ -391,7 +391,7 @@ "sha256:2c2349112351b88699d8d4b6b075022c0808887cb7ad10069318a8b0bc88db44", "sha256:5dbbc68b317e5e42f327f9021763545dc3fc3bfe22e6deb96aaf1fc38874156a" ], - "markers": "python_full_version >= '3.7.0'", + "markers": "python_version >= '3.7'", "version": "==2.1.2" }, "jinja2": { @@ -399,7 +399,7 @@ "sha256:31351a702a408a9e7595a8fc6150fc3f43bb6bf7e319770cbc0db9df9437e852", "sha256:6088930bfe239f0e6710546ab9c19c9ef35e29792895fed6e6e31a023a182a61" ], - "markers": "python_full_version >= '3.7.0'", + "markers": "python_version >= '3.7'", "version": "==3.1.2" }, "jmespath": { @@ -407,7 +407,7 @@ "sha256:a490e280edd1f57d6de88636992d05b71e97d69a26a19f058ecf7d304474bf5e", "sha256:e8dcd576ed616f14ec02eed0005c85973b5890083313860136657e24784e4c04" ], - "markers": "python_full_version >= '3.7.0'", + "markers": "python_version >= '3.7'", "version": "==1.0.0" }, "loguru": { @@ -530,7 +530,7 @@ "sha256:f121a1420d4e173a5d96e47e9a0c0dcff965afdf1626d28de1460815f7c4ee7a", "sha256:fc7b548b17d238737688817ab67deebb30e8073c95749d55538ed473130ec0c7" ], - "markers": "python_full_version >= 
'3.7.0'", + "markers": "python_version >= '3.7'", "version": "==2.1.1" }, "mutagen": { @@ -559,11 +559,11 @@ }, "outcome": { "hashes": [ - "sha256:c7dd9375cfd3c12db9801d080a3b63d4b0a261aa996c4c13152380587288d958", - "sha256:e862f01d4e626e63e8f92c38d1f8d5546d3f9cce989263c521b2e7990d186967" + "sha256:6f82bd3de45da303cf1f771ecafa1633750a358436a8bb60e06a1ceb745d2672", + "sha256:c4ab89a56575d6d38a05aa16daeaa333109c1f96167aba8901ab18b6b5e0f7f5" ], - "markers": "python_version >= '3.6'", - "version": "==1.1.0" + "markers": "python_version >= '3.7'", + "version": "==1.2.0" }, "protobuf": { "hashes": [ @@ -592,7 +592,7 @@ "sha256:e250a42f15bf9d5b09fe1b293bdba2801cd520a9f5ea2d7fb7536d4441811d20", "sha256:ff8d8fa42675249bb456f5db06c00de6c2f4c27a065955917b28c4f15978b9c3" ], - "markers": "python_full_version >= '3.7.0'", + "markers": "python_version >= '3.7'", "version": "==3.20.1" }, "pyaes": { @@ -724,13 +724,52 @@ "index": "pypi", "version": "==6.1.2" }, + "pyyaml": { + "hashes": [ + "sha256:0283c35a6a9fbf047493e3a0ce8d79ef5030852c51e9d911a27badfde0605293", + "sha256:055d937d65826939cb044fc8c9b08889e8c743fdc6a32b33e2390f66013e449b", + "sha256:07751360502caac1c067a8132d150cf3d61339af5691fe9e87803040dbc5db57", + "sha256:0b4624f379dab24d3725ffde76559cff63d9ec94e1736b556dacdfebe5ab6d4b", + "sha256:0ce82d761c532fe4ec3f87fc45688bdd3a4c1dc5e0b4a19814b9009a29baefd4", + "sha256:1e4747bc279b4f613a09eb64bba2ba602d8a6664c6ce6396a4d0cd413a50ce07", + "sha256:213c60cd50106436cc818accf5baa1aba61c0189ff610f64f4a3e8c6726218ba", + "sha256:231710d57adfd809ef5d34183b8ed1eeae3f76459c18fb4a0b373ad56bedcdd9", + "sha256:277a0ef2981ca40581a47093e9e2d13b3f1fbbeffae064c1d21bfceba2030287", + "sha256:2cd5df3de48857ed0544b34e2d40e9fac445930039f3cfe4bcc592a1f836d513", + "sha256:40527857252b61eacd1d9af500c3337ba8deb8fc298940291486c465c8b46ec0", + "sha256:473f9edb243cb1935ab5a084eb238d842fb8f404ed2193a915d1784b5a6b5fc0", + "sha256:48c346915c114f5fdb3ead70312bd042a953a8ce5c7106d5bfb1a5254e47da92", + 
"sha256:50602afada6d6cbfad699b0c7bb50d5ccffa7e46a3d738092afddc1f9758427f", + "sha256:68fb519c14306fec9720a2a5b45bc9f0c8d1b9c72adf45c37baedfcd949c35a2", + "sha256:77f396e6ef4c73fdc33a9157446466f1cff553d979bd00ecb64385760c6babdc", + "sha256:819b3830a1543db06c4d4b865e70ded25be52a2e0631ccd2f6a47a2822f2fd7c", + "sha256:897b80890765f037df3403d22bab41627ca8811ae55e9a722fd0392850ec4d86", + "sha256:98c4d36e99714e55cfbaaee6dd5badbc9a1ec339ebfc3b1f52e293aee6bb71a4", + "sha256:9df7ed3b3d2e0ecfe09e14741b857df43adb5a3ddadc919a2d94fbdf78fea53c", + "sha256:9fa600030013c4de8165339db93d182b9431076eb98eb40ee068700c9c813e34", + "sha256:a80a78046a72361de73f8f395f1f1e49f956c6be882eed58505a15f3e430962b", + "sha256:b3d267842bf12586ba6c734f89d1f5b871df0273157918b0ccefa29deb05c21c", + "sha256:b5b9eccad747aabaaffbc6064800670f0c297e52c12754eb1d976c57e4f74dcb", + "sha256:c5687b8d43cf58545ade1fe3e055f70eac7a5a1a0bf42824308d868289a95737", + "sha256:cba8c411ef271aa037d7357a2bc8f9ee8b58b9965831d9e51baf703280dc73d3", + "sha256:d15a181d1ecd0d4270dc32edb46f7cb7733c7c508857278d3d378d14d606db2d", + "sha256:d4db7c7aef085872ef65a8fd7d6d09a14ae91f691dec3e87ee5ee0539d516f53", + "sha256:d4eccecf9adf6fbcc6861a38015c2a64f38b9d94838ac1810a9023a0609e1b78", + "sha256:d67d839ede4ed1b28a4e8909735fc992a923cdb84e618544973d7dfc71540803", + "sha256:daf496c58a8c52083df09b80c860005194014c3698698d1a57cbcfa182142a3a", + "sha256:e61ceaab6f49fb8bdfaa0f92c4b57bcfbea54c09277b1b4f7ac376bfb7a7c174", + "sha256:f84fbc98b019fef2ee9a1cb3ce93e3187a6df0b2538a651bfb890254ba9f90b5" + ], + "index": "pypi", + "version": "==6.0" + }, "requests": { "hashes": [ - "sha256:68d7c56fd5a8999887728ef304a6d12edc7be74f1cfa47714fc8b414525c9a61", - "sha256:f22fa1e554c9ddfd16e6e41ac79759e17be9e492b3587efa038054674760e72d" + "sha256:bc7861137fbce630f17b03d3ad02ad0bf978c844f3536d0edda6499dafce2b6f", + "sha256:d568723a7ebd25875d8d1eaf5dfa068cd2fc8194b2e483d7b1f7c81918dbec6b" ], - "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 
3.2, 3.3, 3.4, 3.5'", - "version": "==2.27.1" + "markers": "python_version >= '3.7' and python_version < '4'", + "version": "==2.28.0" }, "requests-oauthlib": { "hashes": [ @@ -768,7 +807,7 @@ "sha256:06176b74f3a15f61f1b4f25a1fc29a4429040b7647133a463da8fa5bd28d5ecd", "sha256:2ed07d3866f523cc561bf4a00fc5535827981b117dd7876f036b0c1aca42c947" ], - "markers": "python_full_version >= '3.7.0'", + "markers": "python_version >= '3.7'", "version": "==0.6.0" }, "selenium": { @@ -849,11 +888,11 @@ }, "trio": { "hashes": [ - "sha256:670a52d3115d0e879e1ac838a4eb999af32f858163e3a704fe4839de2a676070", - "sha256:fb2d48e4eab0dfb786a472cd514aaadc71e3445b203bc300bad93daa75d77c1a" + "sha256:4dc0bf9d5cc78767fc4516325b6d80cc0968705a31d0eec2ecd7cdda466265b0", + "sha256:523f39b7b69eef73501cebfe1aafd400a9aad5b03543a0eded52952488ff1c13" ], - "markers": "python_full_version >= '3.7.0'", - "version": "==0.20.0" + "markers": "python_version >= '3.7'", + "version": "==0.21.0" }, "trio-websocket": { "hashes": [ @@ -934,7 +973,7 @@ "sha256:fab7c640815812ed5f10fbee7abbf58788d602046b7bb3af9b1ac753a6d5e916", "sha256:fc06cc8073c8e87072138ba1e431300e2d408f054b27047d047b549455066ff4" ], - "markers": "python_full_version >= '3.7.0'", + "markers": "python_version >= '3.7'", "version": "==10.3" }, "werkzeug": { @@ -942,7 +981,7 @@ "sha256:1ce08e8093ed67d638d63879fd1ba3735817f7a80de3674d293f5984f25fb6e6", "sha256:72a4b735692dd3135217911cbeaa1be5fa3f62bffb8745c5215420a03dc55255" ], - "markers": "python_full_version >= '3.7.0'", + "markers": "python_version >= '3.7'", "version": "==2.1.2" }, "wsproto": { @@ -950,7 +989,7 @@ "sha256:2218cb57952d90b9fca325c0dcfb08c3bda93e8fd8070b0a17f048e2e47a521b", "sha256:a2e56bfd5c7cd83c1369d83b5feccd6d37798b74872866e62616e0ecf111bda8" ], - "markers": "python_full_version >= '3.7.0'", + "markers": "python_version >= '3.7'", "version": "==1.1.0" }, "yt-dlp": { @@ -966,7 +1005,7 @@ "sha256:56bf8aadb83c24db6c4b577e13de374ccfb67da2078beba1d037c17980bf43ad", 
"sha256:c4f6e5bbf48e74f7a38e7cc5b0480ff42b0ae5178957d564d18932525d5cf099" ], - "markers": "python_full_version >= '3.7.0'", + "markers": "python_version >= '3.7'", "version": "==3.8.0" } }, diff --git a/README.md b/README.md index 80ff87f..2e8edd2 100644 --- a/README.md +++ b/README.md @@ -13,7 +13,7 @@ You also need: 5. Internet Archive credentials can be retrieved from https://archive.org/account/s3.php. ### Configuration file -Configuration is done via a config.json file (see [example.config.json](example.config.json)) and some properties of that file can be overwritten via command line arguments. Here is the current result from running the `python auto_archive.py --help`: +Configuration is done via a config.yaml file (see [example.config.yaml](example.config.yaml)) and some properties of that file can be overwritten via command line arguments. Here is the current result from running the `python auto_archive.py --help`:
python auto_archive.py --help @@ -56,36 +56,36 @@ optional arguments:

#### Example invocations -All the configurations can be specified in the JSON config file, but sometimes it is useful to override only some of those like the sheet that we are running the archival on, here are some examples (possibly prepended by `pipenv run`): +All the configurations can be specified in the YAML config file, but sometimes it is useful to override only some of those like the sheet that we are running the archival on, here are some examples (possibly prepended by `pipenv run`): ```bash -# all the configurations come from config.json +# all the configurations come from config.yaml python auto_archive.py -# all the configurations come from my_config.json -python auto_archive.py --config my_config.json +# all the configurations come from my_config.yaml +python auto_archive.py --config my_config.yaml # reads the configurations but saves archived content to google drive instead -python auto_archive.py --config my_config.json --storage gd +python auto_archive.py --config my_config.yaml --storage gd # uses the configurations but for another google docs sheet # with a header on row 2 and with some different column names -python auto_archive.py --config my_config.json --sheet="use it on another sheets doc" --header=2 --col-link="put urls here" +python auto_archive.py --config my_config.yaml --sheet="use it on another sheets doc" --header=2 --col-link="put urls here" -# all the configurations come from config.json and specifies that s3 files should be private +# all the configurations come from config.yaml and specifies that s3 files should be private python auto_archive.py --s3-private ``` ### Extra notes on configuration #### Google Drive -To use Google Drive storage you need the id of the shared folder in the `config.json` file which must be shared with the service account eg `autoarchiverservice@auto-archiver-111111.iam.gserviceaccount.com` and then you can use `--storage=gd` +To use Google Drive storage you need the id of the shared folder in the 
`config.yaml` file which must be shared with the service account eg `autoarchiverservice@auto-archiver-111111.iam.gserviceaccount.com` and then you can use `--storage=gd` #### Telethon (Telegrams API Library) The first time you run, you will be prompted to do a authentication with the phone number associated, alternatively you can put your `anon.session` in the root. ## Running -The `--sheet name` property (or `execution.sheet` in the JSON file) is the name of the Google Sheet to check for URLs. +The `--sheet name` property (or `execution.sheet` in the YAML file) is the name of the Google Sheet to check for URLs. This sheet must have been shared with the Google Service account used by `gspread`. This sheet must also have specific columns (case-insensitive) in the `header` row (see `COLUMN_NAMES` in [gworksheet.py](utils/gworksheet.py)), only the `link` and `status` columns are mandatory: * `Link` (required): the location of the media to be archived. This is the only column that should be supplied with data initially diff --git a/configs/config.py b/configs/config.py index bfab204..8095090 100644 --- a/configs/config.py +++ b/configs/config.py @@ -1,5 +1,5 @@ -import argparse, json +import argparse, yaml, json import gspread from loguru import logger from selenium import webdriver @@ -26,7 +26,7 @@ class Config: c.parse() # parses the values and initializes the Services and API clients # you can then access the Services and APIs like 'c.s3_config' All the configurations available as cmd line options, when included, will - override the configurations in the config.json file. + override the configurations in the config.yaml file. Configurations are split between: 1. "secrets" containing API keys for generating services - not kept in memory 2. 
"execution" containing specific execution configurations @@ -41,12 +41,12 @@ class Config: self.args = self.parser.parse_args() logger.success(f'Command line arguments parsed successfully') self.config_file = self.args.config - self.read_config_json() + self.read_config_yaml() logger.info(f'APIs and Services initialized:\n{self}') - def read_config_json(self): + def read_config_yaml(self): with open(self.config_file, "r", encoding="utf-8") as inf: - self.config = json.load(inf) + self.config = yaml.safe_load(inf) # ----------------------EXECUTION - execution configurations execution = self.config.get("execution", {}) @@ -150,13 +150,13 @@ class Config: """ Creates the CMD line arguments. 'python auto_archive.py --help' """ - parser = argparse.ArgumentParser(description='Automatically archive social media posts, videos, and images from a Google Sheets document. The command line arguments will always override the configurations in the provided JSON config file (--config), only some high-level options are allowed via the command line and the JSON configuration file is the preferred method. The sheet must have the "url" and "status" for the archiver to work. ') + parser = argparse.ArgumentParser(description='Automatically archive social media posts, videos, and images from a Google Sheets document. The command line arguments will always override the configurations in the provided YAML config file (--config), only some high-level options are allowed via the command line and the YAML configuration file is the preferred method. The sheet must have the "url" and "status" for the archiver to work. 
') - parser.add_argument('--config', action='store', dest='config', help='the filename of the JSON configuration file (defaults to \'config.json\')', default='config.json') - parser.add_argument('--storage', action='store', dest='storage', help='which storage to use [execution.storage in config.json]', choices=Config.AVAILABLE_STORAGES) - parser.add_argument('--sheet', action='store', dest='sheet', help='the name of the google sheets document [execution.sheet in config.json]') - parser.add_argument('--header', action='store', dest='header', help='1-based index for the header row [execution.header in config.json]') - parser.add_argument('--s3-private', action='store_true', help='Store content without public access permission (only for storage=s3) [secrets.s3.private in config.json]') + parser.add_argument('--config', action='store', dest='config', help='the filename of the YAML configuration file (defaults to \'config.yaml\')', default='config.yaml') + parser.add_argument('--storage', action='store', dest='storage', help='which storage to use [execution.storage in config.yaml]', choices=Config.AVAILABLE_STORAGES) + parser.add_argument('--sheet', action='store', dest='sheet', help='the name of the google sheets document [execution.sheet in config.yaml]') + parser.add_argument('--header', action='store', dest='header', help='1-based index for the header row [execution.header in config.yaml]') + parser.add_argument('--s3-private', action='store_true', help='Store content without public access permission (only for storage=s3) [secrets.s3.private in config.yaml]') for k, v in GWorksheet.COLUMN_NAMES.items(): help = f"the name of the column to FILL WITH {k} (default='{v}')" diff --git a/example.config.json b/example.config.json deleted file mode 100644 index c00865a..0000000 --- a/example.config.json +++ /dev/null @@ -1,62 +0,0 @@ -{ - "secrets": { - "s3": {// for storage=s3 - "region": "s3 region like fra1", - "bucket": "s3 bucket name like my-bucket", - "key": "s3 API 
key", - "secret": "s3 API secret", - "endpoint_url": "use region format like such: https://{region}.digitaloceanspaces.com", - "cdn_url": "use bucket, region, and key (key is the archived file path generated when executing) format like such: https://{bucket}.{region}.cdn.digitaloceanspaces.com/{key}", - "private": false, // if true S3 urls will not be readable online - "key_path": "random" // you can generate a random UUID for the URL instead of a predictable path, useful to still have public but unlisted files - }, - "wayback": { - "key": "your API key, visit https://archive.org/account/s3.php", - "secret": "your API secret" - }, - "telegram": { - "api_id": "your API key, see https://telegra.ph/How-to-get-Telegram-APP-ID--API-HASH-05-27", - "api_hash": "your API hash", - "bot_token": "optional, but allows access to more content such as large videos, talk to @botfather" - }, - "google_sheets": { - "service_account": "local filename: normally service_account.json, see https://gspread.readthedocs.io/en/latest/oauth2.html#for-bots-using-service-account" - }, - "google_drive": { // for storage=gd - "service_account": "local filename: can be the same or different file from google_sheets.service_account defaults to service_account.json", - "root_folder_id": "copy XXXX from https://drive.google.com/drive/folders/XXXX" - }, - "local":{ // for storage=local - "save_to": "local path to save files in ./local_archive" - }, - "facebook": { - "cookie": "optional facebook cookie to have more access to content, from browser, looks like 'cookie: datr= xxxx'" - } - }, - "execution": { - "sheet": "your-sheet-name", // can be overwritten with CMD --sheet= - "header": 1, //which row of your tabs contains the header, can be overwritten with CMD --header= - "storage": "s3", // which storage to use, can be overwritten with CMD --storage= - "selenium": { // optional configurations for the selenium browser that takes screenshots, these are the defaults - "timeout_seconds": 120, // values 
under 10s might mean screenshots fail to grab - "window_width": 1400, - "window_height": 2000 - }, - "tmp_folder": "tmp/", // local tmp folder to save files before uploading to storage - "save_logs": true, // puts execution logs into /logs folder, defaults to false - "column_names": { // custom column names, only needed if different from default, can be overwritten with CMD --col-NAME="VALUE" - "url": "link", - "archive": "archive location", - "folder": "folder", - "date": "archive date", - "status": "archive status", - "thumbnail": "thumbnail", - "thumbnail_index": "thumbnail index", - "timestamp": "upload timestamp", - "title": "upload title", - "duration": "duration", - "screenshot": "screenshot", - "hash": "hash" - } - } -} \ No newline at end of file diff --git a/example.config.yaml b/example.config.yaml new file mode 100644 index 0000000..a65360e --- /dev/null +++ b/example.config.yaml @@ -0,0 +1,81 @@ +--- +secrets: + # needed if you use storage=s3 + s3: + # contains S3 info on region, bucket, key and secret + region: reg1 + bucket: my-bucket + key: "s3 API key" + secret: "s3 API secret" + # use region format like such + endpoint_url: 'https://{region}.digitaloceanspaces.com' + # use bucket, region, and key (key is the archived file path generated when executing) in a format such as: + cdn_url: "https://{bucket}.{region}.cdn.digitaloceanspaces.com/{key}" + # if private:true S3 urls will not be readable online + private: false + # with 'random' you can generate a random UUID for the URL instead of a predictable path, useful to still have public but unlisted files, alternative is 'default' or omitted from config + key_path: random + + # needed if you use storage=gd + google_drive: + # local filename can be the same or different file from google_sheets.service_account, defaults to service_account.json + service_account: "service_account.json" + root_folder_id: copy XXXX from https://drive.google.com/drive/folders/XXXX + + # needed if you use storage=local + 
local: + # local path to save files in + save_to: "./local_archive" + + wayback: + # to get credentials visit https://archive.org/account/s3.php + key: your API key + secret: your API secret + + telegram: + # to get credentials see: https://telegra.ph/How-to-get-Telegram-APP-ID--API-HASH-05-27 + api_id: your API key + api_hash: your API hash + # optional, but allows access to more content such as large videos, talk to @botfather + bot_token: your bot-token + + google_sheets: + # local filename: defaults to service_account.json, see https://gspread.readthedocs.io/en/latest/oauth2.html#for-bots-using-service-account + service_account: "service_account.json" + + facebook: + # optional facebook cookie to have more access to content, from browser, looks like 'cookie: datr= xxxx' + cookie: "" +execution: + # can be overwritten with CMD --sheet= + sheet: your-sheet-name + # which row of your tabs contains the header, can be overwritten with CMD --header= + header: 1 + # which storage to use, can be overwritten with CMD --storage= + storage: s3 + # optional configurations for the selenium browser that takes screenshots, these are the defaults + selenium: + # values under 10s might mean screenshots fail to grab screenshot + timeout_seconds: 120 + window_width: 1400 + window_height: 2000 + # local tmp folder to save files before uploading to storage + tmp_folder: tmp/ + # puts execution logs into /logs folder, defaults to false + save_logs: true + # custom column names, only needed if different from default, can be overwritten with CMD --col-NAME="VALUE" + # url and status are the only columns required to be present in the google sheet + column_names: + url: link + status: archive status + archive: archive location + # use this column to override default location data + folder: folder + date: archive date + thumbnail: thumbnail + thumbnail_index: thumbnail index + timestamp: upload timestamp + title: upload title + duration: duration + screenshot: screenshot + hash: 
hash From a7a555ea85bde0d80a014e676ae2f006d5c01146 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Tue, 14 Jun 2022 21:19:45 +0200 Subject: [PATCH 59/84] refactor configs --- configs/__init__.py | 5 +++-- configs/config.py | 10 ++-------- configs/selenium_config.py | 8 ++++++++ configs/telethon_config.py | 3 ++- configs/wayback_config.py | 3 ++- 5 files changed, 17 insertions(+), 12 deletions(-) create mode 100644 configs/selenium_config.py diff --git a/configs/__init__.py b/configs/__init__.py index 4409847..5a693ca 100644 --- a/configs/__init__.py +++ b/configs/__init__.py @@ -1,3 +1,4 @@ from .config import Config -from .wayback_config import WaybackConfig -from .telethon_config import TelethonConfig \ No newline at end of file +from .selenium_config import SeleniumConfig +from .telethon_config import TelethonConfig +from .wayback_config import WaybackConfig \ No newline at end of file diff --git a/configs/config.py b/configs/config.py index 8095090..c0be658 100644 --- a/configs/config.py +++ b/configs/config.py @@ -3,21 +3,15 @@ import argparse, yaml, json import gspread from loguru import logger from selenium import webdriver -from dataclasses import dataclass, asdict +from dataclasses import asdict from utils import GWorksheet, getattr_or from .wayback_config import WaybackConfig from .telethon_config import TelethonConfig +from .selenium_config import SeleniumConfig from storages import Storage, S3Config, S3Storage, GDStorage, GDConfig, LocalStorage, LocalConfig -@dataclass -class SeleniumConfig: - timeout_seconds: int = 120 - window_width: int = 1400 - window_height: int = 2000 - - class Config: """ Controls the current execution parameters and manages API configurations diff --git a/configs/selenium_config.py b/configs/selenium_config.py new file mode 100644 index 0000000..8e060af --- /dev/null +++ b/configs/selenium_config.py @@ -0,0 +1,8 @@ +from dataclasses import dataclass + + +@dataclass +class 
SeleniumConfig: + timeout_seconds: int = 120 + window_width: int = 1400 + window_height: int = 2000 diff --git a/configs/telethon_config.py b/configs/telethon_config.py index 2109469..3099bb5 100644 --- a/configs/telethon_config.py +++ b/configs/telethon_config.py @@ -1,8 +1,9 @@ from dataclasses import dataclass + @dataclass class TelethonConfig: api_id: str api_hash: str - bot_token: str \ No newline at end of file + bot_token: str diff --git a/configs/wayback_config.py b/configs/wayback_config.py index 7f455ee..7770f66 100644 --- a/configs/wayback_config.py +++ b/configs/wayback_config.py @@ -1,7 +1,8 @@ from dataclasses import dataclass + @dataclass class WaybackConfig: key: str - secret: str \ No newline at end of file + secret: str From 6872d8e10353ece67c3b59fea38d520a4e00d9cc Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Tue, 14 Jun 2022 21:37:02 +0200 Subject: [PATCH 60/84] check if exists to configuration, save_logs to command line --- auto_archive.py | 2 +- configs/config.py | 8 +++++++- example.config.yaml | 2 ++ 3 files changed, 10 insertions(+), 2 deletions(-) diff --git a/auto_archive.py b/auto_archive.py index 713928b..8c5643a 100644 --- a/auto_archive.py +++ b/auto_archive.py @@ -102,7 +102,7 @@ def process_sheet(c: Config): logger.debug(f'Trying {archiver} on {row=}') try: - result = archiver.download(url, check_if_exists=True) + result = archiver.download(url, check_if_exists=c.check_if_exists) except KeyboardInterrupt as e: raise e # so the higher level catch can catch it except Exception as e: result = False diff --git a/configs/config.py b/configs/config.py index c0be658..dfe786c 100644 --- a/configs/config.py +++ b/configs/config.py @@ -50,8 +50,10 @@ class Config: self.header = int(getattr_or(self.args, "header", execution.get("header", 1))) Storage.TMP_FOLDER = execution.get("tmp_folder", Storage.TMP_FOLDER) self.storage = getattr_or(self.args, "storage", execution.get("storage", "s3")) - if 
getattr_or(self.args, "save_logs", False): + self.save_logs = getattr(self.args, "save_logs") or execution.get("save_logs", False) + if self.save_logs: self.set_log_files() + self.check_if_exists = getattr(self.args, "check_if_exists") or execution.get("check_if_exists", False) # Column names come from config and can be overwritten by CMD # in the end all are considered as lower case @@ -150,6 +152,8 @@ class Config: parser.add_argument('--storage', action='store', dest='storage', help='which storage to use [execution.storage in config.yaml]', choices=Config.AVAILABLE_STORAGES) parser.add_argument('--sheet', action='store', dest='sheet', help='the name of the google sheets document [execution.sheet in config.yaml]') parser.add_argument('--header', action='store', dest='header', help='1-based index for the header row [execution.header in config.yaml]') + parser.add_argument('--check-if-exists', action='store_true', dest='check_if_exists', help='when possible checks if the URL has been archived before and does not archive the same URL twice [exceution.check_if_exists]') + parser.add_argument('--save-logs', action='store_true', dest='save_logs', help='creates or appends execution logs to files logs/LEVEL.log [exceution.save_logs]') parser.add_argument('--s3-private', action='store_true', help='Store content without public access permission (only for storage=s3) [secrets.s3.private in config.yaml]') for k, v in GWorksheet.COLUMN_NAMES.items(): @@ -210,6 +214,8 @@ class Config: "sheet": self.sheet, "storage": self.storage, "header": self.header, + "check_if_exists": self.check_if_exists, + "save_logs": self.save_logs, "tmp_folder": Storage.TMP_FOLDER, "selenium_config": asdict(self.selenium_config), "selenium_webdriver": self.webdriver != None, diff --git a/example.config.yaml b/example.config.yaml index a65360e..0c568c2 100644 --- a/example.config.yaml +++ b/example.config.yaml @@ -53,6 +53,8 @@ execution: header: 1 # which storage to use, can be overwritten with CMD 
--storage= storage: s3 + # defaults to false, when true will try to avoid duplicate URL archives + check_if_exists: true # optional configurations for the selenium browser that takes screenshots, these are the defaults selenium: # values under 10s might mean screenshots fail to grab screenshot From 3bffee41a016b06ff8d337fd7b6c9dace20e9800 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Tue, 14 Jun 2022 21:40:04 +0200 Subject: [PATCH 61/84] README updates --- README.md | 29 +++++++++++++++++++---------- 1 file changed, 19 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index 2e8edd2..39204a2 100644 --- a/README.md +++ b/README.md @@ -20,26 +20,31 @@ Configuration is done via a config.yaml file (see [example.config.yaml](example. ```js -usage: auto_archive.py [-h] [--config CONFIG] [--storage {s3,local,gd}] [--sheet SHEET] [--header HEADER] [--s3-private] [--col-url URL] [--col-folder FOLDER] [--col-archive ARCHIVE] [--col-date DATE] [--col-status STATUS] [--col-thumbnail THUMBNAIL] [--col-thumbnail_index THUMBNAIL_INDEX] [--col-timestamp TIMESTAMP] [--col-title TITLE] [--col-duration DURATION] [--col-screenshot SCREENSHOT] [--col-hash HASH] +usage: auto_archive.py [-h] [--config CONFIG] [--storage {s3,local,gd}] [--sheet SHEET] [--header HEADER] [--check-if-exists] [--save-logs] [--s3-private] [--col-url URL] [--col-status STATUS] [--col-folder FOLDER] + [--col-archive ARCHIVE] [--col-date DATE] [--col-thumbnail THUMBNAIL] [--col-thumbnail_index THUMBNAIL_INDEX] [--col-timestamp TIMESTAMP] [--col-title TITLE] [--col-duration DURATION] + [--col-screenshot SCREENSHOT] [--col-hash HASH] -Automatically archive social media posts, videos, and images from a Google Sheets document. The command line arguments will always override the configurations in the provided JSON config -file (--config), only some high-level options are allowed via the command line and the JSON configuration file is the preferred method. 
+Automatically archive social media posts, videos, and images from a Google Sheets document. +The command line arguments will always override the configurations in the provided YAML config file (--config), only some high-level options +are allowed via the command line and the YAML configuration file is the preferred method. The sheet must have the "url" and "status" for the archiver to work. optional arguments: -h, --help show this help message and exit - --config CONFIG the filename of the JSON configuration file (defaults to 'config.json') + --config CONFIG the filename of the YAML configuration file (defaults to 'config.yaml') --storage {s3,local,gd} - which storage to use [execution.storage in config.json] - --sheet SHEET the name of the google sheets document [execution.sheet in config.json] - --header HEADER 1-based index for the header row [execution.header in config.json] - --s3-private Store content without public access permission (only for storage=s3) [secrets.s3.private in config.json] + which storage to use [execution.storage in config.yaml] + --sheet SHEET the name of the google sheets document [execution.sheet in config.yaml] + --header HEADER 1-based index for the header row [execution.header in config.yaml] + --check-if-exists when possible checks if the URL has been archived before and does not archive the same URL twice [exceution.check_if_exists] + --save-logs creates or appends execution logs to files logs/LEVEL.log [exceution.save_logs] + --s3-private Store content without public access permission (only for storage=s3) [secrets.s3.private in config.yaml] --col-url URL the name of the column to READ url FROM (default='link') + --col-status STATUS the name of the column to FILL WITH status (default='archive status') --col-folder FOLDER the name of the column to READ folder FROM (default='destination folder') --col-archive ARCHIVE the name of the column to FILL WITH archive (default='archive location') --col-date DATE the name of the column to 
FILL WITH date (default='archive date') - --col-status STATUS the name of the column to FILL WITH status (default='archive status') - --col-thumbnail THUMBNAIL + --col-thumbnail THUMBNAIL the name of the column to FILL WITH thumbnail (default='thumbnail') --col-thumbnail_index THUMBNAIL_INDEX the name of the column to FILL WITH thumbnail_index (default='thumbnail index') @@ -62,6 +67,10 @@ All the configurations can be specified in the YAML config file, but sometimes i # all the configurations come from config.yaml python auto_archive.py +# all the configurations come from config.yaml, +# checks if URL is not archived twice and saves logs to logs/ folder +python auto_archive.py --check-if-exists --save-logs + # all the configurations come from my_config.yaml python auto_archive.py --config my_config.yaml From 59afe7fd6305a036b1fad5706e5d22b929786579 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Wed, 15 Jun 2022 16:38:18 +0200 Subject: [PATCH 62/84] vk-archiver implemented --- .gitignore | 3 +- Pipfile | 2 + Pipfile.lock | 166 ++++++++++++++++++++++++++++++++++----- archivers/__init__.py | 3 +- archivers/vk_archiver.py | 72 +++++++++++++++++ auto_archive.py | 3 +- configs/__init__.py | 3 +- configs/config.py | 14 ++++ configs/vk_config.py | 8 ++ 9 files changed, 251 insertions(+), 23 deletions(-) create mode 100644 archivers/vk_archiver.py create mode 100644 configs/vk_config.py diff --git a/.gitignore b/.gitignore index 4b7e9ce..8501c5d 100644 --- a/.gitignore +++ b/.gitignore @@ -15,4 +15,5 @@ config-*.json config.yaml config-*.yaml logs/* -local_archive/ \ No newline at end of file +local_archive/ +vk_config*.json \ No newline at end of file diff --git a/Pipfile b/Pipfile index fedfd51..1b55d86 100644 --- a/Pipfile +++ b/Pipfile @@ -22,6 +22,8 @@ google-auth-oauthlib = "*" oauth2client = "*" python-slugify = "*" pyyaml = "*" +vk-api = "*" +dateparser = "*" [requires] python_version = "3.9" diff --git a/Pipfile.lock 
b/Pipfile.lock index 9f1a12b..0b911f3 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -1,7 +1,7 @@ { "_meta": { "hash": { - "sha256": "602a05a8fa475181c24714ab57188a417fdfddf373a7dab4fa0ba0fcb7ce8d0a" + "sha256": "d06498403429a8fffcd6d049b314872c0095abee7fb9c6ffd3ba3d7b0c31c8cd" }, "pipfile-spec": 6, "requires": { @@ -50,19 +50,19 @@ }, "boto3": { "hashes": [ - "sha256:28ab0947c49a6fb2409004d4a10b2828aec231cb95ca1d800cb1411e191cc201", - "sha256:833e67edfb73f2cc22ff27a1c33728686dc90a9e81ba2551f9462ea2d1b04f41" + "sha256:0821212ff521cb934801b1f655cef3c0e976775324b1018f1751700d0f42dbb4", + "sha256:87d34861727699c795bf8d65703f2435e75f12879bdd483e08b35b7c5510e8c8" ], "index": "pypi", - "version": "==1.24.8" + "version": "==1.24.9" }, "botocore": { "hashes": [ - "sha256:ad92702930d6cb7b587fc2f619672feb74d5218f8de387a28c2905820db79027", - "sha256:db6667b8dfd175d16187653942cd91dd1f0cf36adc0ea9d7a0805ba4d2a3321f" + "sha256:5669b982b0583e73daef1fe0a4df311055e6287326f857dbb1dcc2de1d8412ad", + "sha256:7a7588b0170e571317496ac4104803329d5bc792bc008e8a757ffd440f1b6fa6" ], "markers": "python_version >= '3.7'", - "version": "==1.27.8" + "version": "==1.27.9" }, "brotli": { "hashes": [ @@ -152,7 +152,7 @@ "sha256:9c5705e395cd70084351dd8ad5c41e65655e08ce46f2ec9cf6c2c08390f71eb7", "sha256:f1d53542ee8cbedbe2118b5686372fb33c297fcd6379b050cca0ef13a597382a" ], - "markers": "python_version >= '3.6'", + "markers": "python_full_version >= '3.6.0'", "version": "==2022.5.18.1" }, "cffi": { @@ -267,6 +267,14 @@ ], "version": "==37.0.2" }, + "dateparser": { + "hashes": [ + "sha256:038196b1f12c7397e38aad3d61588833257f6f552baa63a1499e6987fa8d42d9", + "sha256:9600874312ff28a41f96ec7ccdc73be1d1c44435719da47fea3339d55ff5a628" + ], + "index": "pypi", + "version": "==1.1.1" + }, "ffmpeg-python": { "hashes": [ "sha256:65225db34627c578ef0e11c8b1eb528bb35e024752f6f10b78c011f6f64c4127", @@ -303,7 +311,7 @@ "sha256:958024c6aa3460b08f35741231076a4dd9a4c819a6a39d44da9627febe8b28f0", 
"sha256:ce1daa49644b50398093d2a9ad886501aa845e2602af70c3001b9f402a9d7359" ], - "markers": "python_version >= '3.6'", + "markers": "python_full_version >= '3.6.0'", "version": "==2.8.1" }, "google-api-python-client": { @@ -316,11 +324,11 @@ }, "google-auth": { "hashes": [ - "sha256:8a954960f852d5f19e6af14dd8e75c20159609e85d8db37e4013cc8c3824a7e1", - "sha256:df549a1433108801b11bdcc0e312eaf0d5f0500db42f0523e4d65c78722e8475" + "sha256:819b70140d05501739e1387291d39f0de3b4dff3b00ae4aff8e7a05369957f89", + "sha256:9b1da39ab8731c3061f36fefde9f8bb902dbee9eb28e3a67e8cfa7dc1be76227" ], "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5'", - "version": "==2.7.0" + "version": "==2.8.0" }, "google-auth-httplib2": { "hashes": [ @@ -343,7 +351,7 @@ "sha256:023eaea9d8c1cceccd9587c6af6c20f33eeeb05d4148670f2b0322dc1511700c", "sha256:b09b56f5463070c2153753ef123f07d2e49235e89148e9b2459ec8ed2f68d7d3" ], - "markers": "python_version >= '3.6'", + "markers": "python_full_version >= '3.6.0'", "version": "==1.56.2" }, "gspread": { @@ -359,7 +367,7 @@ "sha256:70813c1135087a248a4d38cc0e1a0181ffab2188141a93eaf567940c3957ff06", "sha256:8ddd78563b633ca55346c8cd41ec0af27d3c79931828beffb46ce70a379e7442" ], - "markers": "python_version >= '3.6'", + "markers": "python_full_version >= '3.6.0'", "version": "==0.13.0" }, "httplib2": { @@ -554,7 +562,7 @@ "sha256:23a8208d75b902797ea29fd31fa80a15ed9dc2c6c16fe73f5d346f83f6fa27a2", "sha256:6db33440354787f9b7f3a6dbd4febf5d0f93758354060e802f6c06cb493022fe" ], - "markers": "python_version >= '3.6'", + "markers": "python_full_version >= '3.6.0'", "version": "==3.2.0" }, "outcome": { @@ -682,7 +690,7 @@ "sha256:5eb116118f9612ff1ee89ac96437bb6b49e8f04d8a13b514ba26f620208e26eb", "sha256:dc9c10fb40944260f6ed4c688ece0cd2048414940f1cea51b8b226318411c519" ], - "markers": "python_version >= '3.6'", + "markers": "python_full_version >= '3.6.0'", "version": "==2.12.0" }, "pyopenssl": { @@ -724,6 +732,21 @@ "index": "pypi", 
"version": "==6.1.2" }, + "pytz": { + "hashes": [ + "sha256:1e760e2fe6a8163bc0b3d9a19c4f84342afa0a2affebfaa84b01b978a02ecaa7", + "sha256:e68985985296d9a66a881eb3193b0906246245294a881e7c8afe623866ac6a5c" + ], + "version": "==2022.1" + }, + "pytz-deprecation-shim": { + "hashes": [ + "sha256:8314c9692a636c8eb3bda879b9f119e350e93223ae83e70e80c31675a0fdc1a6", + "sha256:af097bae1b616dde5c5744441e2ddc69e74dfdcb0c263129610d85b87445a59d" + ], + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5'", + "version": "==0.1.0.post0" + }, "pyyaml": { "hashes": [ "sha256:0283c35a6a9fbf047493e3a0ce8d79ef5030852c51e9d911a27badfde0605293", @@ -763,7 +786,88 @@ "index": "pypi", "version": "==6.0" }, + "regex": { + "hashes": [ + "sha256:0008650041531d0eadecc96a73d37c2dc4821cf51b0766e374cb4f1ddc4e1c14", + "sha256:03299b0bcaa7824eb7c0ebd7ef1e3663302d1b533653bfe9dc7e595d453e2ae9", + "sha256:06b1df01cf2aef3a9790858af524ae2588762c8a90e784ba00d003f045306204", + "sha256:09b4b6ccc61d4119342b26246ddd5a04accdeebe36bdfe865ad87a0784efd77f", + "sha256:0be0c34a39e5d04a62fd5342f0886d0e57592a4f4993b3f9d257c1f688b19737", + "sha256:0d96eec8550fd2fd26f8e675f6d8b61b159482ad8ffa26991b894ed5ee19038b", + "sha256:0eb0e2845e81bdea92b8281a3969632686502565abf4a0b9e4ab1471c863d8f3", + "sha256:13bbf0c9453c6d16e5867bda7f6c0c7cff1decf96c5498318bb87f8136d2abd4", + "sha256:17e51ad1e6131c496b58d317bc9abec71f44eb1957d32629d06013a21bc99cac", + "sha256:1977bb64264815d3ef016625adc9df90e6d0e27e76260280c63eca993e3f455f", + "sha256:1e30762ddddb22f7f14c4f59c34d3addabc789216d813b0f3e2788d7bcf0cf29", + "sha256:1e73652057473ad3e6934944af090852a02590c349357b79182c1b681da2c772", + "sha256:20e6a27959f162f979165e496add0d7d56d7038237092d1aba20b46de79158f1", + "sha256:286ff9ec2709d56ae7517040be0d6c502642517ce9937ab6d89b1e7d0904f863", + "sha256:297c42ede2c81f0cb6f34ea60b5cf6dc965d97fa6936c11fc3286019231f0d66", + "sha256:320c2f4106962ecea0f33d8d31b985d3c185757c49c1fb735501515f963715ed", + 
"sha256:35ed2f3c918a00b109157428abfc4e8d1ffabc37c8f9abc5939ebd1e95dabc47", + "sha256:3d146e5591cb67c5e836229a04723a30af795ef9b70a0bbd913572e14b7b940f", + "sha256:42bb37e2b2d25d958c25903f6125a41aaaa1ed49ca62c103331f24b8a459142f", + "sha256:42d6007722d46bd2c95cce700181570b56edc0dcbadbfe7855ec26c3f2d7e008", + "sha256:43eba5c46208deedec833663201752e865feddc840433285fbadee07b84b464d", + "sha256:452519bc4c973e961b1620c815ea6dd8944a12d68e71002be5a7aff0a8361571", + "sha256:4b9c16a807b17b17c4fa3a1d8c242467237be67ba92ad24ff51425329e7ae3d0", + "sha256:5510932596a0f33399b7fff1bd61c59c977f2b8ee987b36539ba97eb3513584a", + "sha256:55820bc631684172b9b56a991d217ec7c2e580d956591dc2144985113980f5a3", + "sha256:57484d39447f94967e83e56db1b1108c68918c44ab519b8ecfc34b790ca52bf7", + "sha256:58ba41e462653eaf68fc4a84ec4d350b26a98d030be1ab24aba1adcc78ffe447", + "sha256:5bc5f921be39ccb65fdda741e04b2555917a4bced24b4df14eddc7569be3b493", + "sha256:5dcc4168536c8f68654f014a3db49b6b4a26b226f735708be2054314ed4964f4", + "sha256:5f92a7cdc6a0ae2abd184e8dfd6ef2279989d24c85d2c85d0423206284103ede", + "sha256:67250b36edfa714ba62dc62d3f238e86db1065fccb538278804790f578253640", + "sha256:6df070a986fc064d865c381aecf0aaff914178fdf6874da2f2387e82d93cc5bd", + "sha256:729aa8ca624c42f309397c5fc9e21db90bf7e2fdd872461aabdbada33de9063c", + "sha256:72bc3a5effa5974be6d965ed8301ac1e869bc18425c8a8fac179fbe7876e3aee", + "sha256:74d86e8924835f863c34e646392ef39039405f6ce52956d8af16497af4064a30", + "sha256:79e5af1ff258bc0fe0bdd6f69bc4ae33935a898e3cbefbbccf22e88a27fa053b", + "sha256:7b103dffb9f6a47ed7ffdf352b78cfe058b1777617371226c1894e1be443afec", + "sha256:83f03f0bd88c12e63ca2d024adeee75234d69808b341e88343b0232329e1f1a1", + "sha256:86d7a68fa53688e1f612c3246044157117403c7ce19ebab7d02daf45bd63913e", + "sha256:878c626cbca3b649e14e972c14539a01191d79e58934e3f3ef4a9e17f90277f8", + "sha256:878f5d649ba1db9f52cc4ef491f7dba2d061cdc48dd444c54260eebc0b1729b9", + "sha256:87bc01226cd288f0bd9a4f9f07bf6827134dc97a96c22e2d28628e824c8de231", 
+ "sha256:8babb2b5751105dc0aef2a2e539f4ba391e738c62038d8cb331c710f6b0f3da7", + "sha256:91e0f7e7be77250b808a5f46d90bf0032527d3c032b2131b63dee54753a4d729", + "sha256:9557545c10d52c845f270b665b52a6a972884725aa5cf12777374e18f2ea8960", + "sha256:9ccb0a4ab926016867260c24c192d9df9586e834f5db83dfa2c8fffb3a6e5056", + "sha256:9d828c5987d543d052b53c579a01a52d96b86f937b1777bbfe11ef2728929357", + "sha256:9efa41d1527b366c88f265a227b20bcec65bda879962e3fc8a2aee11e81266d7", + "sha256:aaf5317c961d93c1a200b9370fb1c6b6836cc7144fef3e5a951326912bf1f5a3", + "sha256:ab69b4fe09e296261377d209068d52402fb85ef89dc78a9ac4a29a895f4e24a7", + "sha256:ad397bc7d51d69cb07ef89e44243f971a04ce1dca9bf24c992c362406c0c6573", + "sha256:ae17fc8103f3b63345709d3e9654a274eee1c6072592aec32b026efd401931d0", + "sha256:af4d8cc28e4c7a2f6a9fed544228c567340f8258b6d7ea815b62a72817bbd178", + "sha256:b22ff939a8856a44f4822da38ef4868bd3a9ade22bb6d9062b36957c850e404f", + "sha256:b549d851f91a4efb3e65498bd4249b1447ab6035a9972f7fc215eb1f59328834", + "sha256:be319f4eb400ee567b722e9ea63d5b2bb31464e3cf1b016502e3ee2de4f86f5c", + "sha256:c0446b2871335d5a5e9fcf1462f954586b09a845832263db95059dcd01442015", + "sha256:c68d2c04f7701a418ec2e5631b7f3552efc32f6bcc1739369c6eeb1af55f62e0", + "sha256:c87ac58b9baaf50b6c1b81a18d20eda7e2883aa9a4fb4f1ca70f2e443bfcdc57", + "sha256:caa2734ada16a44ae57b229d45091f06e30a9a52ace76d7574546ab23008c635", + "sha256:cb34c2d66355fb70ae47b5595aafd7218e59bb9c00ad8cc3abd1406ca5874f07", + "sha256:cb3652bbe6720786b9137862205986f3ae54a09dec8499a995ed58292bdf77c2", + "sha256:cf668f26604e9f7aee9f8eaae4ca07a948168af90b96be97a4b7fa902a6d2ac1", + "sha256:d326ff80ed531bf2507cba93011c30fff2dd51454c85f55df0f59f2030b1687b", + "sha256:d6c2441538e4fadd4291c8420853431a229fcbefc1bf521810fbc2629d8ae8c2", + "sha256:d6ecfd1970b3380a569d7b3ecc5dd70dba295897418ed9e31ec3c16a5ab099a5", + "sha256:e5602a9b5074dcacc113bba4d2f011d2748f50e3201c8139ac5b68cf2a76bd8b", + 
"sha256:ef806f684f17dbd6263d72a54ad4073af42b42effa3eb42b877e750c24c76f86", + "sha256:f3356afbb301ec34a500b8ba8b47cba0b44ed4641c306e1dd981a08b416170b5", + "sha256:f6f7ee2289176cb1d2c59a24f50900f8b9580259fa9f1a739432242e7d254f93", + "sha256:f7e8f1ee28e0a05831c92dc1c0c1c94af5289963b7cf09eca5b5e3ce4f8c91b0", + "sha256:f8169ec628880bdbca67082a9196e2106060a4a5cbd486ac51881a4df805a36f", + "sha256:fbc88d3ba402b5d041d204ec2449c4078898f89c4a6e6f0ed1c1a510ef1e221d", + "sha256:fbd3fe37353c62fd0eb19fb76f78aa693716262bcd5f9c14bb9e5aca4b3f0dc4" + ], + "markers": "python_full_version >= '3.6.0'", + "version": "==2022.3.2" + }, "requests": { + "extras": [], "hashes": [ "sha256:bc7861137fbce630f17b03d3ad02ad0bf978c844f3536d0edda6499dafce2b6f", "sha256:d568723a7ebd25875d8d1eaf5dfa068cd2fc8194b2e483d7b1f7c81918dbec6b" @@ -799,7 +903,7 @@ "sha256:5c6bd9dc7a543b7fe4304a631f8a8a3b674e2bbfc49c2ae96200cdbe55df6b17", "sha256:95c5d300c4e879ee69708c428ba566c59478fd653cc3a22243eeb8ed846950bb" ], - "markers": "python_version >= '3.6' and python_version < '4'", + "markers": "python_version < '4' and python_full_version >= '3.6.0'", "version": "==4.8" }, "s3transfer": { @@ -853,7 +957,7 @@ "sha256:3b2503d3c7084a42b1ebd08116e5f81aadfaea95863628c80a3b774a11b7c759", "sha256:fc53893b3da2c33de295667a0e19f078c14bf86544af307354de5fcf12a3f30d" ], - "markers": "python_version >= '3.6'", + "markers": "python_full_version >= '3.6.0'", "version": "==2.3.2.post1" }, "telethon": { @@ -902,12 +1006,28 @@ "markers": "python_version >= '3.5'", "version": "==0.9.2" }, + "tzdata": { + "hashes": [ + "sha256:238e70234214138ed7b4e8a0fab0e5e13872edab3be586ab8198c407620e2ab9", + "sha256:8b536a8ec63dc0751342b3984193a3118f8fca2afe25752bb9b7fffd398552d3" + ], + "markers": "python_full_version >= '3.6.0'", + "version": "==2022.1" + }, + "tzlocal": { + "hashes": [ + "sha256:89885494684c929d9191c57aa27502afc87a579be5cdd3225c77c463ea043745", + "sha256:ee5842fa3a795f023514ac2d801c4a81d1743bbe642e3940143326b3a00addd7" + ], + 
"markers": "python_full_version >= '3.6.0'", + "version": "==4.2" + }, "uritemplate": { "hashes": [ "sha256:4346edfc5c3b79f694bccd6d6099a322bbeb628dbf2cd86eea55a456ce5124f0", "sha256:830c08b8d99bdd312ea4ead05994a38e8936266f84b9a7878232db50b044e02e" ], - "markers": "python_version >= '3.6'", + "markers": "python_full_version >= '3.6.0'", "version": "==4.1.1" }, "urllib3": { @@ -922,6 +1042,14 @@ "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' and python_version < '4'", "version": "==1.26.9" }, + "vk-api": { + "hashes": [ + "sha256:11c731e214ebc7fa911db81efb021f97587493a5402b992f24748fe1cd9d7afc", + "sha256:d0ae766fa93a40d47c5da045d94201721bf766dbde122a1d2253516b35c5edf3" + ], + "index": "pypi", + "version": "==11.9.8" + }, "websockets": { "hashes": [ "sha256:07cdc0a5b2549bcfbadb585ad8471ebdc7bdf91e32e34ae3889001c1c106a6af", diff --git a/archivers/__init__.py b/archivers/__init__.py index 40fbb4b..33700d1 100644 --- a/archivers/__init__.py +++ b/archivers/__init__.py @@ -5,4 +5,5 @@ from .telethon_archiver import TelethonArchiver from .tiktok_archiver import TiktokArchiver from .wayback_archiver import WaybackArchiver from .youtubedl_archiver import YoutubeDLArchiver -from .twitter_archiver import TwitterArchiver \ No newline at end of file +from .twitter_archiver import TwitterArchiver +from .vk_archiver import VkArchiver \ No newline at end of file diff --git a/archivers/vk_archiver.py b/archivers/vk_archiver.py new file mode 100644 index 0000000..91eb0db --- /dev/null +++ b/archivers/vk_archiver.py @@ -0,0 +1,72 @@ +import re, json + +import vk_api, dateparser +from bs4 import BeautifulSoup +from loguru import logger + +from storages import Storage +from .base_archiver import Archiver, ArchiveResult +from configs import VkConfig + + +class VkArchiver(Archiver): + """" + VK videos are handled by YTDownloader, this archiver gets posts text and images. 
+ Currently only works for /wall posts + """ + name = "vk" + wall_pattern = re.compile(r"(wall.{0,1}\d+_\d+)") + onclick_pattern = re.compile(r"({.*})") + + def __init__(self, storage: Storage, driver, config: VkConfig): + super().__init__(storage, driver) + if config != None: + self.vk_session = vk_api.VkApi(config.username, config.password) + self.vk_session.auth(token_only=True) + + def download(self, url, check_if_exists=False): + # detect URLs that this archiver can handle + has_wall = self.wall_pattern.search(url) + if has_wall: + wall_url = f'https://vk.com/{has_wall[0]}' + logger.info(f"found valid wall id from {url=} : {wall_url=}") + return self.archive_wall(wall_url, check_if_exists) + return False + + def archive_wall(self, wall_url, check_if_exists): + res = self.vk_session.http.get(wall_url).text + soup = BeautifulSoup(res, "html.parser") + image_urls = [] + time = None + try: + rel_date = soup.find("a", class_="post_link").find("span", class_="rel_date") + t = rel_date.get_text() + if "time" in rel_date.attrs: + t = rel_date["time"] + elif "abs_time" in rel_date.attrs: + t = rel_date["abs_time"] + time = dateparser.parse(t, settings={"RETURN_AS_TIMEZONE_AWARE": True, "TO_TIMEZONE": "UTC"}) + except Exception as e: + logger.warning(f"could not fetch time from post: {e}") + + post = soup.find("div", class_="wall_text") + post_text = post.find(class_="wall_post_text").get_text() + for anchor in post.find_all("a", attrs={"aria-label": "photo"}): + if img_url := self.get_image_from_anchor(anchor): + image_urls.append(img_url) + + page_cdn, page_hash, thumbnail = self.generate_media_page(image_urls, wall_url, post_text, requester=self.vk_session.http) + screenshot = self.get_screenshot(wall_url) + return ArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=time) + + def get_image_from_anchor(self, anchor): + try: + # get anchor.onlick text, retrieve the JSON value there + # retrieve 
"temp"."z" which contains the image with more quality + temp_json = json.loads(self.onclick_pattern.search(anchor["onclick"])[0])["temp"] + for quality in ["z", "y", "x"]: # decreasing quality + if quality in temp_json: + return temp_json[quality] + except Exception as e: + logger.warning(f"failed to get image from vk wall anchor: {e}") + return False diff --git a/auto_archive.py b/auto_archive.py index 8c5643a..2b8a9e9 100644 --- a/auto_archive.py +++ b/auto_archive.py @@ -3,7 +3,7 @@ import os, datetime, shutil, traceback, random from loguru import logger from slugify import slugify -from archivers import TelethonArchiver, TelegramArchiver, TiktokArchiver, YoutubeDLArchiver, TwitterArchiver, WaybackArchiver, ArchiveResult, Archiver +from archivers import TelethonArchiver, TelegramArchiver, TiktokArchiver, YoutubeDLArchiver, TwitterArchiver, VkArchiver, WaybackArchiver, ArchiveResult, Archiver from utils import GWorksheet, mkdir_if_not_exists, expand_url from configs import Config from storages import Storage @@ -95,6 +95,7 @@ def process_sheet(c: Config): YoutubeDLArchiver(storage, c.webdriver, c.facebook_cookie), TelegramArchiver(storage, c.webdriver), TwitterArchiver(storage, c.webdriver), + VkArchiver(storage, c.webdriver, c.vk_config), WaybackArchiver(storage, c.webdriver, c.wayback_config) ] diff --git a/configs/__init__.py b/configs/__init__.py index 5a693ca..f70c9c6 100644 --- a/configs/__init__.py +++ b/configs/__init__.py @@ -1,4 +1,5 @@ from .config import Config from .selenium_config import SeleniumConfig from .telethon_config import TelethonConfig -from .wayback_config import WaybackConfig \ No newline at end of file +from .wayback_config import WaybackConfig +from .vk_config import VkConfig \ No newline at end of file diff --git a/configs/config.py b/configs/config.py index dfe786c..3cef93a 100644 --- a/configs/config.py +++ b/configs/config.py @@ -9,6 +9,7 @@ from utils import GWorksheet, getattr_or from .wayback_config import WaybackConfig from 
.telethon_config import TelethonConfig from .selenium_config import SeleniumConfig +from .vk_config import VkConfig from storages import Storage, S3Config, S3Storage, GDStorage, GDConfig, LocalStorage, LocalConfig @@ -120,6 +121,7 @@ class Config: secret=secrets["wayback"]["secret"], ) else: + self.wayback_config = None logger.debug(f"'wayback' key not present in the {self.config_file=}") # telethon config @@ -130,8 +132,19 @@ class Config: bot_token=secrets["telegram"].get("bot_token", None) ) else: + self.telegram_config = None logger.debug(f"'telegram' key not present in the {self.config_file=}") + # vk config + if "vk" in secrets: + self.vk_config = VkConfig( + username=secrets["vk"]["username"], + password=secrets["vk"]["password"] + ) + else: + self.vk_config = None + logger.debug(f"'vk' key not present in the {self.config_file=}") + del self.config["secrets"] # delete to prevent leaks def set_log_files(self): @@ -225,6 +238,7 @@ class Config: "local_config": hasattr(self, "local_config"), "wayback_config": self.wayback_config != None, "telegram_config": self.telegram_config != None, + "vk_config": self.vk_config != None, "gsheets_client": self.gsheets_client != None, "column_names": self.column_names, }, ensure_ascii=False, indent=4) diff --git a/configs/vk_config.py b/configs/vk_config.py new file mode 100644 index 0000000..db2e61c --- /dev/null +++ b/configs/vk_config.py @@ -0,0 +1,8 @@ + +from dataclasses import dataclass + + +@dataclass +class VkConfig: + username: str + password: str From 951b16ba9c19053836973709eea9bacab00e7fb3 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Wed, 15 Jun 2022 16:38:30 +0200 Subject: [PATCH 63/84] improving media page with images and videos --- archivers/base_archiver.py | 26 ++++++++++++++++++++++---- 1 file changed, 22 insertions(+), 4 deletions(-) diff --git a/archivers/base_archiver.py b/archivers/base_archiver.py index 18e4c1b..eb508c0 100644 --- 
a/archivers/base_archiver.py +++ b/archivers/base_archiver.py @@ -1,4 +1,4 @@ -import os, datetime, shutil, hashlib, time, requests, re +import os, datetime, shutil, hashlib, time, requests, re, mimetypes from dataclasses import dataclass from abc import ABC, abstractmethod from urllib.parse import urlparse @@ -58,7 +58,13 @@ class Archiver(ABC):

{url}

{self.name} object data:

{object}" page += f"" @@ -77,8 +83,20 @@ class Archiver(ABC): page_cdn = self.storage.get_cdn_url(page_key) return (page_cdn, page_hash, thumbnail) + def _guess_file_type(self, path:str): + """ + Receives a URL or filename and returns global mimetype like 'image' or 'video' and the specific mimetype as a tuple + see https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/MIME_types/Common_types + ex: ('audio', 'audio/mp3') + """ + mime = mimetypes.guess_type(path)[0] + if mime is not None: + return mime.split("/")[0], mime + return "", "" + + # eg images in a tweet save to cloud storage - def generate_media_page(self, urls, url, object): + def generate_media_page(self, urls, url, object, requester=requests): """ For a list of media urls, fetch them, upload them and call self.generate_media_page_html with them @@ -94,7 +112,7 @@ class Archiver(ABC): filename = os.path.join(Storage.TMP_FOLDER, key) - d = requests.get(media_url, headers=headers) + d = requester.get(media_url, headers=headers) with open(filename, 'wb') as f: f.write(d.content) From 771c5376c4b9706c315bd7efb69bde6f4f19f170 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Wed, 15 Jun 2022 16:47:20 +0200 Subject: [PATCH 64/84] simplify display --- archivers/base_archiver.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/archivers/base_archiver.py b/archivers/base_archiver.py index eb508c0..76d6267 100644 --- a/archivers/base_archiver.py +++ b/archivers/base_archiver.py @@ -58,12 +58,12 @@ class Archiver(ABC):

{url}

{self.name} object data:

{object}" @@ -83,19 +83,18 @@ class Archiver(ABC): page_cdn = self.storage.get_cdn_url(page_key) return (page_cdn, page_hash, thumbnail) - def _guess_file_type(self, path:str): + def _guess_file_type(self, path: str): """ - Receives a URL or filename and returns global mimetype like 'image' or 'video' and the specific mimetype as a tuple + Receives a URL or filename and returns global mimetype like 'image' or 'video' see https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/MIME_types/Common_types - ex: ('audio', 'audio/mp3') """ mime = mimetypes.guess_type(path)[0] if mime is not None: - return mime.split("/")[0], mime - return "", "" - + return mime.split("/")[0] + return "" # eg images in a tweet save to cloud storage + def generate_media_page(self, urls, url, object, requester=requests): """ For a list of media urls, fetch them, upload them From 2dbdf9b8d3e68bc53853c97143c8be370fb4d9a3 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Wed, 15 Jun 2022 17:04:50 +0200 Subject: [PATCH 65/84] check if exists --- archivers/base_archiver.py | 1 - archivers/vk_archiver.py | 17 +++++++++++++---- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/archivers/base_archiver.py b/archivers/base_archiver.py index 76d6267..2ba7c4d 100644 --- a/archivers/base_archiver.py +++ b/archivers/base_archiver.py @@ -225,7 +225,6 @@ class Archiver(ABC): key = key_folder + fname self.storage.upload(thumbnail_filename, key) - cdn_url = self.storage.get_cdn_url(key) cdn_urls.append(cdn_url) diff --git a/archivers/vk_archiver.py b/archivers/vk_archiver.py index 91eb0db..b9d3f6c 100644 --- a/archivers/vk_archiver.py +++ b/archivers/vk_archiver.py @@ -28,12 +28,21 @@ class VkArchiver(Archiver): # detect URLs that this archiver can handle has_wall = self.wall_pattern.search(url) if has_wall: - wall_url = f'https://vk.com/{has_wall[0]}' - logger.info(f"found valid wall id from {url=} : {wall_url=}") - return 
self.archive_wall(wall_url, check_if_exists) + wall_id = has_wall[0] + wall_url = f'https://vk.com/{wall_id}' + logger.info(f"found valid wall id from {url=} : {wall_id=}") + key = self.get_html_key(wall_url) + + # if check if exists will not download again + if check_if_exists and self.storage.exists(key): + screenshot = self.get_screenshot(wall_url) + cdn_url = self.storage.get_cdn_url(key) + return ArchiveResult(status="already archived", cdn_url=cdn_url, screenshot=screenshot) + + return self.archive_wall(wall_url) return False - def archive_wall(self, wall_url, check_if_exists): + def archive_wall(self, wall_url): res = self.vk_session.http.get(wall_url).text soup = BeautifulSoup(res, "html.parser") image_urls = [] From 5cc21fa4e05ffca94d0f323736d3d6493deed658 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Wed, 15 Jun 2022 17:04:56 +0200 Subject: [PATCH 66/84] bug fix --- archivers/telegram_archiver.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/archivers/telegram_archiver.py b/archivers/telegram_archiver.py index 22de30e..0b6e777 100644 --- a/archivers/telegram_archiver.py +++ b/archivers/telegram_archiver.py @@ -53,7 +53,6 @@ class TelegramArchiver(Archiver): key = self.get_key(video_id) filename = os.path.join(Storage.TMP_FOLDER, key) - cdn_url = self.storage.get_cdn_url(key) if check_if_exists and self.storage.exists(key): status = 'already archived' @@ -84,5 +83,6 @@ class TelegramArchiver(Archiver): filename, key, duration=duration) os.remove(filename) + cdn_url = self.storage.get_cdn_url(key) return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb, thumbnail_index=thumb_index, duration=duration, title=original_url, timestamp=s.find_all('time')[1].get('datetime'), hash=hash, screenshot=screenshot) From b1f70bb81890eeb7d8502e5273144cecdc92ed79 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Wed, 15 Jun 2022 17:14:08 +0200 
Subject: [PATCH 67/84] minor improvements --- archivers/wayback_archiver.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/archivers/wayback_archiver.py b/archivers/wayback_archiver.py index 81c1644..f46d1cb 100644 --- a/archivers/wayback_archiver.py +++ b/archivers/wayback_archiver.py @@ -18,10 +18,12 @@ class WaybackArchiver(Archiver): def __init__(self, storage: Storage, driver, config: WaybackConfig): super(WaybackArchiver, self).__init__(storage, driver) self.config = config - # TODO: this logic should live at the auto-archiver level self.seen_urls = {} def download(self, url, check_if_exists=False): + if self.config is None: + logger.error('Missing Wayback config') + return False if check_if_exists: if url in self.seen_urls: return self.seen_urls[url] @@ -57,7 +59,7 @@ class WaybackArchiver(Archiver): retries += 1 if status_r.status_code != 200: - return ArchiveResult(status="Internet archive failed", screenshot=screenshot) + return ArchiveResult(status=f"Internet archive failed: check https://web.archive.org/save/status/{job_id}", screenshot=screenshot) status_json = status_r.json() if status_json['status'] != 'success': From 86e1d3545ef85d616bbe51be5141337aa6277062 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Wed, 15 Jun 2022 17:17:46 +0200 Subject: [PATCH 68/84] fix for missing telethon config --- archivers/telethon_archiver.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/archivers/telethon_archiver.py b/archivers/telethon_archiver.py index 18996d8..bfc7f57 100644 --- a/archivers/telethon_archiver.py +++ b/archivers/telethon_archiver.py @@ -16,8 +16,9 @@ class TelethonArchiver(Archiver): def __init__(self, storage: Storage, driver, config: TelethonConfig): super().__init__(storage, driver) - self.client = TelegramClient("./anon", config.api_id, config.api_hash) - self.bot_token = config.bot_token + if config: + self.client = TelegramClient("./anon", 
config.api_id, config.api_hash) + self.bot_token = config.bot_token def _get_media_posts_in_group(self, chat, original_post, max_amp=10): """ @@ -38,6 +39,10 @@ class TelethonArchiver(Archiver): return media def download(self, url, check_if_exists=False): + if not hasattr(self, "client"): + logger.error('Missing Telethon config') + return False + # detect URLs that we definitely cannot handle matches = self.link_pattern.findall(url) if not len(matches): From 2f02336403dd186d6041682b237b82a75902838d Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Wed, 15 Jun 2022 17:18:47 +0200 Subject: [PATCH 69/84] example config --- example.config.yaml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/example.config.yaml b/example.config.yaml index 0c568c2..9026ad4 100644 --- a/example.config.yaml +++ b/example.config.yaml @@ -39,6 +39,11 @@ secrets: # optional, but allows access to more content such as large videos, talk to @botfather bot_token: your bot-token + # vkontakte (vk.com) credentials + vk: + username: "phone number or email" + password: "password" + google_sheets: # local filename: defaults to service_account.json, see https://gspread.readthedocs.io/en/latest/oauth2.html#for-bots-using-service-account service_account: "service_account.json" From c08b5268f730ead8df42ee32ddc9f13f99d058dc Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Wed, 15 Jun 2022 21:25:15 +0200 Subject: [PATCH 70/84] using API instead of scraping --- archivers/base_archiver.py | 4 +- archivers/vk_archiver.py | 85 ++++++++++++++++++-------------------- 2 files changed, 42 insertions(+), 47 deletions(-) diff --git a/archivers/base_archiver.py b/archivers/base_archiver.py index 2ba7c4d..0f490fe 100644 --- a/archivers/base_archiver.py +++ b/archivers/base_archiver.py @@ -95,7 +95,7 @@ class Archiver(ABC): # eg images in a tweet save to cloud storage - def generate_media_page(self, urls, url, object, 
requester=requests): + def generate_media_page(self, urls, url, object): """ For a list of media urls, fetch them, upload them and call self.generate_media_page_html with them @@ -111,7 +111,7 @@ class Archiver(ABC): filename = os.path.join(Storage.TMP_FOLDER, key) - d = requester.get(media_url, headers=headers) + d = requests.get(media_url, headers=headers) with open(filename, 'wb') as f: f.write(d.content) diff --git a/archivers/vk_archiver.py b/archivers/vk_archiver.py index b9d3f6c..e37c7a5 100644 --- a/archivers/vk_archiver.py +++ b/archivers/vk_archiver.py @@ -1,4 +1,4 @@ -import re, json +import re, json, requests import vk_api, dateparser from bs4 import BeautifulSoup @@ -16,6 +16,7 @@ class VkArchiver(Archiver): """ name = "vk" wall_pattern = re.compile(r"(wall.{0,1}\d+_\d+)") + photo_pattern = re.compile(r"(photo.{0,1}\d+_\d+)") onclick_pattern = re.compile(r"({.*})") def __init__(self, storage: Storage, driver, config: VkConfig): @@ -27,55 +28,49 @@ class VkArchiver(Archiver): def download(self, url, check_if_exists=False): # detect URLs that this archiver can handle has_wall = self.wall_pattern.search(url) + has_photo = self.photo_pattern.search(url) + _id, method = None, None if has_wall: - wall_id = has_wall[0] - wall_url = f'https://vk.com/{wall_id}' - logger.info(f"found valid wall id from {url=} : {wall_id=}") - key = self.get_html_key(wall_url) + _id = has_wall[0] + method = self.archive_wall + elif has_photo: + _id = has_photo[0] + method = self.archive_photo + else: return False - # if check if exists will not download again - if check_if_exists and self.storage.exists(key): - screenshot = self.get_screenshot(wall_url) - cdn_url = self.storage.get_cdn_url(key) - return ArchiveResult(status="already archived", cdn_url=cdn_url, screenshot=screenshot) + logger.info(f"found valid {_id=} from {url=}") + proper_url = f'https://vk.com/{_id}' - return self.archive_wall(wall_url) - return False + # if check if exists will not download again + key = 
self.get_html_key(proper_url) + if check_if_exists and self.storage.exists(key): + screenshot = self.get_screenshot(proper_url) + cdn_url = self.storage.get_cdn_url(key) + return ArchiveResult(status="already archived", cdn_url=cdn_url, screenshot=screenshot) - def archive_wall(self, wall_url): - res = self.vk_session.http.get(wall_url).text - soup = BeautifulSoup(res, "html.parser") - image_urls = [] - time = None - try: - rel_date = soup.find("a", class_="post_link").find("span", class_="rel_date") - t = rel_date.get_text() - if "time" in rel_date.attrs: - t = rel_date["time"] - elif "abs_time" in rel_date.attrs: - t = rel_date["abs_time"] - time = dateparser.parse(t, settings={"RETURN_AS_TIMEZONE_AWARE": True, "TO_TIMEZONE": "UTC"}) - except Exception as e: - logger.warning(f"could not fetch time from post: {e}") + return method(proper_url, _id) - post = soup.find("div", class_="wall_text") - post_text = post.find(class_="wall_post_text").get_text() - for anchor in post.find_all("a", attrs={"aria-label": "photo"}): - if img_url := self.get_image_from_anchor(anchor): - image_urls.append(img_url) + def archive_photo(self, photo_url, photo_id): + headers = {"access_token": self.vk_session.token["access_token"], "photos": photo_id.replace("photo", ""), "extended": "1", "v": self.vk_session.api_version} + req = requests.get("https://api.vk.com/method/photos.getById", headers) + res = req.json()["response"][0] + img_url = res["orig_photo"]["url"] + time = dateparser.parse(str(res["date"]), settings={"RETURN_AS_TIMEZONE_AWARE": True, "TO_TIMEZONE": "UTC"}) - page_cdn, page_hash, thumbnail = self.generate_media_page(image_urls, wall_url, post_text, requester=self.vk_session.http) - screenshot = self.get_screenshot(wall_url) + page_cdn, page_hash, thumbnail = self.generate_media_page([img_url], photo_url, res) + screenshot = self.get_screenshot(photo_url) return ArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, 
timestamp=time) - def get_image_from_anchor(self, anchor): - try: - # get anchor.onlick text, retrieve the JSON value there - # retrieve "temp"."z" which contains the image with more quality - temp_json = json.loads(self.onclick_pattern.search(anchor["onclick"])[0])["temp"] - for quality in ["z", "y", "x"]: # decreasing quality - if quality in temp_json: - return temp_json[quality] - except Exception as e: - logger.warning(f"failed to get image from vk wall anchor: {e}") - return False + def archive_wall(self, wall_url, wall_id): + headers = {"access_token": self.vk_session.token["access_token"], "posts": wall_id.replace("wall", ""), "extended": "1", "copy_history_depth": "2", "v": self.vk_session.api_version} + req = requests.get("https://api.vk.com/method/wall.getById", headers) + res = req.json()["response"] + wall = res["items"][0] + img_urls = [p[p["type"]]["sizes"][-1]["url"] for p in wall["attachments"]] if "attachments" in wall else [] + title = wall["text"][0:200] # more on the page + time = dateparser.parse(str(wall["date"]), settings={"RETURN_AS_TIMEZONE_AWARE": True, "TO_TIMEZONE": "UTC"}) + + page_cdn, page_hash, thumbnail = self.generate_media_page(img_urls, wall_url, res) + screenshot = self.get_screenshot(wall_url) + return ArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=time, title=title) + From ed4b193ae7209f723f774f0d78b89c3e87d289eb Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Wed, 15 Jun 2022 22:30:08 +0200 Subject: [PATCH 71/84] walrus --- archivers/vk_archiver.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/archivers/vk_archiver.py b/archivers/vk_archiver.py index e37c7a5..d62a2a5 100644 --- a/archivers/vk_archiver.py +++ b/archivers/vk_archiver.py @@ -27,13 +27,11 @@ class VkArchiver(Archiver): def download(self, url, check_if_exists=False): # detect URLs that this archiver can handle - has_wall = 
self.wall_pattern.search(url) - has_photo = self.photo_pattern.search(url) _id, method = None, None - if has_wall: + if has_wall := self.wall_pattern.search(url): _id = has_wall[0] method = self.archive_wall - elif has_photo: + elif has_photo := self.photo_pattern.search(url): _id = has_photo[0] method = self.archive_photo else: return False From 3b6678818e053e47ac5e450faa5e8361aa893af4 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Wed, 15 Jun 2022 22:47:55 +0200 Subject: [PATCH 72/84] title for vk photo --- archivers/vk_archiver.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/archivers/vk_archiver.py b/archivers/vk_archiver.py index d62a2a5..e9c999f 100644 --- a/archivers/vk_archiver.py +++ b/archivers/vk_archiver.py @@ -52,12 +52,13 @@ class VkArchiver(Archiver): headers = {"access_token": self.vk_session.token["access_token"], "photos": photo_id.replace("photo", ""), "extended": "1", "v": self.vk_session.api_version} req = requests.get("https://api.vk.com/method/photos.getById", headers) res = req.json()["response"][0] + title = res["text"][:200] # more on the page img_url = res["orig_photo"]["url"] time = dateparser.parse(str(res["date"]), settings={"RETURN_AS_TIMEZONE_AWARE": True, "TO_TIMEZONE": "UTC"}) page_cdn, page_hash, thumbnail = self.generate_media_page([img_url], photo_url, res) screenshot = self.get_screenshot(photo_url) - return ArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=time) + return ArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=time, title=title) def archive_wall(self, wall_url, wall_id): headers = {"access_token": self.vk_session.token["access_token"], "posts": wall_id.replace("wall", ""), "extended": "1", "copy_history_depth": "2", "v": self.vk_session.api_version} @@ -65,7 +66,7 @@ class VkArchiver(Archiver): res = 
req.json()["response"] wall = res["items"][0] img_urls = [p[p["type"]]["sizes"][-1]["url"] for p in wall["attachments"]] if "attachments" in wall else [] - title = wall["text"][0:200] # more on the page + title = wall["text"][:200] # more on the page time = dateparser.parse(str(wall["date"]), settings={"RETURN_AS_TIMEZONE_AWARE": True, "TO_TIMEZONE": "UTC"}) page_cdn, page_hash, thumbnail = self.generate_media_page(img_urls, wall_url, res) From 659097c07213d107b63e24c4e77b3776c4aaf71f Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Wed, 15 Jun 2022 22:54:18 +0200 Subject: [PATCH 73/84] better error log --- archivers/vk_archiver.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/archivers/vk_archiver.py b/archivers/vk_archiver.py index e9c999f..ee43e8e 100644 --- a/archivers/vk_archiver.py +++ b/archivers/vk_archiver.py @@ -46,13 +46,17 @@ class VkArchiver(Archiver): cdn_url = self.storage.get_cdn_url(key) return ArchiveResult(status="already archived", cdn_url=cdn_url, screenshot=screenshot) - return method(proper_url, _id) + try: + return method(proper_url, _id) + except Exception as e: + logger.error(f"something went wrong with vk archive, possibly 404 causing index out of range, or missing key: {e}") + return False def archive_photo(self, photo_url, photo_id): headers = {"access_token": self.vk_session.token["access_token"], "photos": photo_id.replace("photo", ""), "extended": "1", "v": self.vk_session.api_version} req = requests.get("https://api.vk.com/method/photos.getById", headers) res = req.json()["response"][0] - title = res["text"][:200] # more on the page + title = res["text"][:200] # more on the page img_url = res["orig_photo"]["url"] time = dateparser.parse(str(res["date"]), settings={"RETURN_AS_TIMEZONE_AWARE": True, "TO_TIMEZONE": "UTC"}) @@ -66,10 +70,9 @@ class VkArchiver(Archiver): res = req.json()["response"] wall = res["items"][0] img_urls = 
[p[p["type"]]["sizes"][-1]["url"] for p in wall["attachments"]] if "attachments" in wall else [] - title = wall["text"][:200] # more on the page + title = wall["text"][:200] # more on the page time = dateparser.parse(str(wall["date"]), settings={"RETURN_AS_TIMEZONE_AWARE": True, "TO_TIMEZONE": "UTC"}) page_cdn, page_hash, thumbnail = self.generate_media_page(img_urls, wall_url, res) screenshot = self.get_screenshot(wall_url) return ArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=time, title=title) - From 08f48ae351d94c6d46f2a7eb6b3d43544bf2595a Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Wed, 15 Jun 2022 23:17:32 +0200 Subject: [PATCH 74/84] handling selenium better --- configs/config.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/configs/config.py b/configs/config.py index 3cef93a..78a5090 100644 --- a/configs/config.py +++ b/configs/config.py @@ -4,6 +4,7 @@ import gspread from loguru import logger from selenium import webdriver from dataclasses import asdict +from selenium.common.exceptions import TimeoutException from utils import GWorksheet, getattr_or from .wayback_config import WaybackConfig @@ -210,16 +211,23 @@ class Config: def destroy_webdriver(self): if self.webdriver is not None and type(self.webdriver) != str: self.webdriver.quit() + del self.webdriver def recreate_webdriver(self): - self.destroy_webdriver() options = webdriver.FirefoxOptions() options.headless = True options.set_preference('network.protocol-handler.external.tg', False) - self.webdriver = webdriver.Firefox(options=options) - self.webdriver.set_window_size(self.selenium_config.window_width, + try: + new_webdriver = webdriver.Firefox(options=options) + # only destroy if creation is successful + self.destroy_webdriver() + self.webdriver = new_webdriver + self.webdriver.set_window_size(self.selenium_config.window_width, 
self.selenium_config.window_height) - self.webdriver.set_page_load_timeout(self.selenium_config.timeout_seconds) + self.webdriver.set_page_load_timeout(self.selenium_config.timeout_seconds) + except TimeoutException as e: + logger.error(f"failed to get new webdriver, possibly due to insufficient system resources or timeout settings: {e}") + def __str__(self) -> str: return json.dumps({ From c6bcb5900562074e75c0d2bd7f0aeaba804b4823 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Wed, 15 Jun 2022 23:36:10 +0200 Subject: [PATCH 75/84] improvement for albums --- archivers/vk_archiver.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/archivers/vk_archiver.py b/archivers/vk_archiver.py index ee43e8e..e48e9ef 100644 --- a/archivers/vk_archiver.py +++ b/archivers/vk_archiver.py @@ -69,7 +69,18 @@ class VkArchiver(Archiver): req = requests.get("https://api.vk.com/method/wall.getById", headers) res = req.json()["response"] wall = res["items"][0] - img_urls = [p[p["type"]]["sizes"][-1]["url"] for p in wall["attachments"]] if "attachments" in wall else [] + img_urls = [] + if "attachments" in wall: + for a in wall["attachments"]: + attachment = a[a["type"]] + if "thumb" in attachment: + attachment = attachment["thumb"] + if "sizes" in attachment: + try: img_urls.append(attachment["sizes"][-1]["url"]) + except Exception as e: + logger.warning(f"could not get image from attachment: {e}") + + title = wall["text"][:200] # more on the page time = dateparser.parse(str(wall["date"]), settings={"RETURN_AS_TIMEZONE_AWARE": True, "TO_TIMEZONE": "UTC"}) From 2ac08a34f633800c073f82bd3f5eedc491456d8d Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Thu, 16 Jun 2022 13:45:02 +0200 Subject: [PATCH 76/84] ydl timestamp bug fix --- archivers/youtubedl_archiver.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/archivers/youtubedl_archiver.py 
b/archivers/youtubedl_archiver.py index be3477d..7990131 100644 --- a/archivers/youtubedl_archiver.py +++ b/archivers/youtubedl_archiver.py @@ -106,11 +106,11 @@ class YoutubeDLArchiver(Archiver): os.remove(filename) - timestamp = datetime.datetime.utcfromtimestamp(info['timestamp']).replace(tzinfo=datetime.timezone.utc).isoformat() \ - if 'timestamp' in info else \ - datetime.datetime.strptime(info['upload_date'], '%Y%m%d').replace(tzinfo=datetime.timezone.utc) \ - if 'upload_date' in info and info['upload_date'] is not None else \ - None + timestamp = None + if 'timestamp' in info and info['timestamp'] is not None: + timestamp = datetime.datetime.utcfromtimestamp(info['timestamp']).replace(tzinfo=datetime.timezone.utc).isoformat() + elif 'upload_date' in info and info['upload_date'] is not None: + timestamp = datetime.datetime.strptime(info['upload_date'], '%Y%m%d').replace(tzinfo=datetime.timezone.utc) return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb, thumbnail_index=thumb_index, duration=duration, title=info['title'] if 'title' in info else None, timestamp=timestamp, hash=hash, screenshot=screenshot) From 277d81d687ddfbf14ba56defa5a63555df130e40 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Thu, 16 Jun 2022 14:16:18 +0200 Subject: [PATCH 77/84] telethon minor fix --- archivers/telethon_archiver.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/archivers/telethon_archiver.py b/archivers/telethon_archiver.py index bfc7f57..d36c762 100644 --- a/archivers/telethon_archiver.py +++ b/archivers/telethon_archiver.py @@ -27,7 +27,7 @@ class TelethonArchiver(Archiver): of `max_amp` both ways Returns a list of [post] where each post has media and is in the same grouped_id """ - if original_post.grouped_id is None: + if not hasattr(original_post, "grouped_id") or original_post.grouped_id is None: return [original_post] if original_post.media is not None else [] search_ids = [i for i 
in range(original_post.id - max_amp, original_post.id + max_amp + 1)] From b37f7adc8fb082dfda1e05422f595a08e84dea8b Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Thu, 16 Jun 2022 14:29:51 +0200 Subject: [PATCH 78/84] another telethon fix --- archivers/telethon_archiver.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/archivers/telethon_archiver.py b/archivers/telethon_archiver.py index d36c762..bbea956 100644 --- a/archivers/telethon_archiver.py +++ b/archivers/telethon_archiver.py @@ -28,7 +28,7 @@ class TelethonArchiver(Archiver): Returns a list of [post] where each post has media and is in the same grouped_id """ if not hasattr(original_post, "grouped_id") or original_post.grouped_id is None: - return [original_post] if original_post.media is not None else [] + return [original_post] if hasattr(original_post, "media") and original_post.media is not None else [] search_ids = [i for i in range(original_post.id - max_amp, original_post.id + max_amp + 1)] posts = self.client.get_messages(chat, ids=search_ids) From ec1993c5dc812b3fbfebf75793b07bc192cb4018 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Thu, 16 Jun 2022 14:33:50 +0200 Subject: [PATCH 79/84] telethon fix --- archivers/telethon_archiver.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/archivers/telethon_archiver.py b/archivers/telethon_archiver.py index bbea956..ad1fadb 100644 --- a/archivers/telethon_archiver.py +++ b/archivers/telethon_archiver.py @@ -8,6 +8,7 @@ from telethon.errors import ChannelInvalidError from storages import Storage from .base_archiver import Archiver, ArchiveResult from configs import TelethonConfig +from utils import getattr_or class TelethonArchiver(Archiver): @@ -27,8 +28,8 @@ class TelethonArchiver(Archiver): of `max_amp` both ways Returns a list of [post] where each post has media and is in the same grouped_id """ - if not 
hasattr(original_post, "grouped_id") or original_post.grouped_id is None: - return [original_post] if hasattr(original_post, "media") and original_post.media is not None else [] + if getattr_or(original_post, "grouped_id") is not None: + return [original_post] if getattr_or(original_post, "media") is not None else [] search_ids = [i for i in range(original_post.id - max_amp, original_post.id + max_amp + 1)] posts = self.client.get_messages(chat, ids=search_ids) @@ -110,4 +111,4 @@ class TelethonArchiver(Archiver): return ArchiveResult(status=status, cdn_url=cdn_url, title=post.message, thumbnail=key_thumb, thumbnail_index=thumb_index, timestamp=post.date, hash=hash, screenshot=screenshot) page_cdn, page_hash, _ = self.generate_media_page_html(url, [], html.escape(str(post))) - return ArchiveResult(status=status, cdn_url=page_cdn, title=post.message, timestamp=post.date, hash=page_hash, screenshot=screenshot) + return ArchiveResult(status=status, cdn_url=page_cdn, title=getattr_or(post, "message", ""), timestamp=getattr_or(post, "date"), hash=page_hash, screenshot=screenshot) From 81ce27bdb3030755ca1ff12ba8d4169715b68fcd Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Thu, 16 Jun 2022 14:34:33 +0200 Subject: [PATCH 80/84] fix --- archivers/telethon_archiver.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/archivers/telethon_archiver.py b/archivers/telethon_archiver.py index ad1fadb..6407f09 100644 --- a/archivers/telethon_archiver.py +++ b/archivers/telethon_archiver.py @@ -28,7 +28,7 @@ class TelethonArchiver(Archiver): of `max_amp` both ways Returns a list of [post] where each post has media and is in the same grouped_id """ - if getattr_or(original_post, "grouped_id") is not None: + if getattr_or(original_post, "grouped_id") is None: return [original_post] if getattr_or(original_post, "media") is not None else [] search_ids = [i for i in range(original_post.id - max_amp, original_post.id + max_amp + 
1)] From 81eb00a76771526390785e00cf148601dfcf2845 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Thu, 16 Jun 2022 16:19:57 +0200 Subject: [PATCH 81/84] handle deleted telegram --- archivers/telethon_archiver.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/archivers/telethon_archiver.py b/archivers/telethon_archiver.py index 6407f09..efef1cc 100644 --- a/archivers/telethon_archiver.py +++ b/archivers/telethon_archiver.py @@ -29,7 +29,7 @@ class TelethonArchiver(Archiver): Returns a list of [post] where each post has media and is in the same grouped_id """ if getattr_or(original_post, "grouped_id") is None: - return [original_post] if getattr_or(original_post, "media") is not None else [] + return [original_post] if getattr_or(original_post, "media") else [] search_ids = [i for i in range(original_post.id - max_amp, original_post.id + max_amp + 1)] posts = self.client.get_messages(chat, ids=search_ids) @@ -67,6 +67,8 @@ class TelethonArchiver(Archiver): logger.error(f"Could not fetch telegram {url}. 
This error can be fixed if you setup a bot_token in addition to api_id and api_hash: {e}") return False + if post is None: return False + media_posts = self._get_media_posts_in_group(chat, post) logger.debug(f'got {len(media_posts)=} for {url=}') @@ -111,4 +113,4 @@ class TelethonArchiver(Archiver): return ArchiveResult(status=status, cdn_url=cdn_url, title=post.message, thumbnail=key_thumb, thumbnail_index=thumb_index, timestamp=post.date, hash=hash, screenshot=screenshot) page_cdn, page_hash, _ = self.generate_media_page_html(url, [], html.escape(str(post))) - return ArchiveResult(status=status, cdn_url=page_cdn, title=getattr_or(post, "message", ""), timestamp=getattr_or(post, "date"), hash=page_hash, screenshot=screenshot) + return ArchiveResult(status=status, cdn_url=page_cdn, title=post.message, timestamp=getattr_or(post, "date"), hash=page_hash, screenshot=screenshot) From afc7e133cf4fb12e4b7ce1e06b3adebf7c437cd0 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Thu, 16 Jun 2022 16:26:30 +0200 Subject: [PATCH 82/84] simplifying telethon --- archivers/telethon_archiver.py | 19 +++++-------------- 1 file changed, 5 insertions(+), 14 deletions(-) diff --git a/archivers/telethon_archiver.py b/archivers/telethon_archiver.py index efef1cc..2f4de02 100644 --- a/archivers/telethon_archiver.py +++ b/archivers/telethon_archiver.py @@ -74,7 +74,7 @@ class TelethonArchiver(Archiver): screenshot = self.get_screenshot(url) - if len(media_posts) > 1: + if len(media_posts) > 0: key = self.get_html_key(url) if check_if_exists and self.storage.exists(key): @@ -86,7 +86,7 @@ class TelethonArchiver(Archiver): group_id = post.grouped_id if post.grouped_id is not None else post.id uploaded_media = [] message = post.message - for mp in media_posts: + for i, mp in enumerate(media_posts): if len(mp.message) > len(message): message = mp.message filename_dest = os.path.join(Storage.TMP_FOLDER, f'{chat}_{group_id}', str(mp.id)) filename = 
self.client.download_media(mp.media, filename_dest) @@ -95,22 +95,13 @@ class TelethonArchiver(Archiver): hash = self.get_hash(filename) cdn_url = self.storage.get_cdn_url(key) uploaded_media.append({'cdn_url': cdn_url, 'key': key, 'hash': hash}) + if i == 0: + key_thumb, thumb_index = self.get_thumbnails(filename, key) os.remove(filename) page_cdn, page_hash, _ = self.generate_media_page_html(url, uploaded_media, html.escape(str(post))) - return ArchiveResult(status=status, cdn_url=page_cdn, title=message, timestamp=post.date, hash=page_hash, screenshot=screenshot) - elif len(media_posts) == 1: - key = self.get_key(f'{chat}_{post_id}') - filename = self.client.download_media(post.media, os.path.join(Storage.TMP_FOLDER, key)) - key = filename.split(Storage.TMP_FOLDER)[1].replace(" ", "") - self.storage.upload(filename, key) - hash = self.get_hash(filename) - cdn_url = self.storage.get_cdn_url(key) - key_thumb, thumb_index = self.get_thumbnails(filename, key) - os.remove(filename) - - return ArchiveResult(status=status, cdn_url=cdn_url, title=post.message, thumbnail=key_thumb, thumbnail_index=thumb_index, timestamp=post.date, hash=hash, screenshot=screenshot) + return ArchiveResult(status=status, cdn_url=page_cdn, title=message, timestamp=post.date, hash=page_hash, screenshot=screenshot, thumbnail=key_thumb, thumbnail_index=thumb_index) page_cdn, page_hash, _ = self.generate_media_page_html(url, [], html.escape(str(post))) return ArchiveResult(status=status, cdn_url=page_cdn, title=post.message, timestamp=getattr_or(post, "date"), hash=page_hash, screenshot=screenshot) From cdd66fb7dad8ca64aaad0dbff9cc9c767070f837 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Thu, 16 Jun 2022 16:30:08 +0200 Subject: [PATCH 83/84] returning empty string thumbs --- archivers/base_archiver.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/archivers/base_archiver.py b/archivers/base_archiver.py index 0f490fe..a8b0413 
100644 --- a/archivers/base_archiver.py +++ b/archivers/base_archiver.py @@ -229,7 +229,7 @@ class Archiver(ABC): cdn_urls.append(cdn_url) if len(cdn_urls) == 0: - return ('None', 'None') + return ('', '') key_thumb = cdn_urls[int(len(cdn_urls) * 0.1)] From 14add4392318c84f2ec626b9297eb0db0518fedb Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Thu, 16 Jun 2022 17:17:25 +0200 Subject: [PATCH 84/84] fixing auto_auto_archive --- README.md | 15 ++++++++------- auto_auto_archive.py | 31 ++++++++++++++++--------------- 2 files changed, 24 insertions(+), 22 deletions(-) diff --git a/README.md b/README.md index 39204a2..e60774c 100644 --- a/README.md +++ b/README.md @@ -140,7 +140,7 @@ With this configuration, the archiver should archive and store all media added t # auto_auto_archiver -To make it easier to set up new auto-archiver sheets, the auto-auto-archiver will look at a particular sheet and run the auto-archiver on every sheet name in column A, starting from row 11. (It starts here to support instructional text in the first rows of the sheet, as shown below.) This script takes one command line argument, with `--sheet`, the name of the sheet. It must be shared with the same service account. +To make it easier to set up new auto-archiver sheets, the auto-auto-archiver will look at a particular sheet and run the auto-archiver on every sheet name in column A, starting from row 11. (It starts here to support instructional text in the first rows of the sheet, as shown below.) You can simply use your default config as for `auto_archiver.py` but use `--sheet` to specify the name of the sheet that lists the names of sheets to archive.It must be shared with the same service account. ![A screenshot of a Google Spreadsheet configured to show instructional text and a list of sheet names to check with auto-archiver.](docs/auto-auto.png) @@ -152,15 +152,16 @@ Code is split into functional concepts: 1. 
[GWorksheet](utils/gworksheet.py) - facilitates some of the reading/writing tasks for a Google Worksheet ### Current Archivers -Archivers are tested in a meaningful order with Wayback Machine being the default, that can easily be changed in the code. +Archivers are tested in a meaningful order with Wayback Machine being the failsafe, that can easily be changed in the code. ```mermaid graph TD - A(Archiver) -->|parent of| B(YoutubeDLArchiver) - A -->|parent of| C(TikTokArchiver) - A -->|parent of| D(TwitterArchiver) + A(Archiver) -->|parent of| B(TelethonArchiver) + A -->|parent of| C(TiktokArchiver) + A -->|parent of| D(YoutubeDLArchiver) A -->|parent of| E(TelegramArchiver) - A -->|parent of| F(TelethonArchiver) - A -->|parent of| G(WaybackArchiver) + A -->|parent of| F(TwitterArchiver) + A -->|parent of| G(VkArchiver) + A -->|parent of| H(WaybackArchiver) ``` ### Current Storages ```mermaid diff --git a/auto_auto_archive.py b/auto_auto_archive.py index a518204..14bb751 100644 --- a/auto_auto_archive.py +++ b/auto_auto_archive.py @@ -1,29 +1,30 @@ -import gspread -import argparse +import shutil import auto_archive from loguru import logger +from configs import Config +from storages import Storage +from utils import mkdir_if_not_exists + def main(): - parser = argparse.ArgumentParser( - description="Automatically use youtube-dl to download media from a Google Sheet") - parser.add_argument("--sheet", action="store", dest="sheet") + c = Config() + c.parse() + logger.info(f'Opening document {c.sheet} to look for sheet names to archive') - args = parser.parse_args() - - logger.info("Opening document " + args.sheet) - - gc = gspread.service_account(filename='service_account.json') - sh = gc.open(args.sheet) + gc = c.gsheets_client + sh = gc.open(c.sheet) wks = sh.get_worksheet(0) values = wks.get_all_values() + mkdir_if_not_exists(Storage.TMP_FOLDER) for i in range(11, len(values)): - sheet_name = values[i][0] + c.sheet = values[i][0] + logger.info(f"Processing 
{c.sheet}") + auto_archive.process_sheet(c) + c.destroy_webdriver() + shutil.rmtree(Storage.TMP_FOLDER) - logger.info("Processing " + sheet_name) - - auto_archive.process_sheet(sheet_name) if __name__ == "__main__": main()