diff --git a/archivers/youtubedl_archiver.py b/archivers/youtubedl_archiver.py index 426641a..c83c7e3 100644 --- a/archivers/youtubedl_archiver.py +++ b/archivers/youtubedl_archiver.py @@ -17,7 +17,7 @@ class YoutubeDLArchiver(Archiver): def download(self, url, check_if_exists=False): netloc = self.get_netloc(url) - if netloc in ['facebook.com', 'www.facebook.com']: + if netloc in ['facebook.com', 'www.facebook.com'] and self.fb_cookie: logger.debug('Using Facebook cookie') yt_dlp.utils.std_headers['cookie'] = self.fb_cookie diff --git a/auto_archive.py b/auto_archive.py index 287e231..749e912 100644 --- a/auto_archive.py +++ b/auto_archive.py @@ -1,18 +1,14 @@ -# import os +import sys import datetime -# import argparse import shutil -# import gspread from loguru import logger from dotenv import load_dotenv import traceback -# import archivers from archivers import TelethonArchiver, TelegramArchiver, TiktokArchiver, YoutubeDLArchiver, TwitterArchiver, WaybackArchiver, ArchiveResult from utils import GWorksheet, mkdir_if_not_exists, expand_url from configs import Config -import sys logger.add("logs/1trace.log", level="TRACE") logger.add("logs/2info.log", level="INFO") @@ -79,16 +75,6 @@ def process_sheet(c: Config, sheet, header=1, columns=GWorksheet.COLUMN_NAMES): c.set_folder(f'{sheet.replace(" ", "_")}/{wks.title.replace(" ", "_")}/') storage = c.get_storage() - # order matters, first to succeed excludes remaining - active_archivers = [ - TelethonArchiver(storage, c.webdriver, c.telegram_config), - TelegramArchiver(storage, c.webdriver), - TiktokArchiver(storage, c.webdriver), - YoutubeDLArchiver(storage, c.webdriver), - TwitterArchiver(storage, c.webdriver), - WaybackArchiver(storage, c.webdriver) - archivers.YoutubeDLArchiver(s3_client, driver, os.getenv('FACEBOOK_COOKIE')), - ] # loop through rows in worksheet for row in range(1 + header, gw.count_rows() + 1): @@ -99,17 +85,19 @@ def process_sheet(c: Config, sheet, header=1, columns=GWorksheet.COLUMN_NAMES): gw.set_cell(row, 'status', 'Archive in progress') url = expand_url(url) - - # make a new driver so each spreadsheet row is idempotent - options = webdriver.FirefoxOptions() - options.headless = True - options.set_preference('network.protocol-handler.external.tg', False) + c.recreate_webdriver() + + # order matters, first to succeed excludes remaining + active_archivers = [ + TelethonArchiver(storage, c.webdriver, c.telegram_config), + TelegramArchiver(storage, c.webdriver), + TiktokArchiver(storage, c.webdriver), + YoutubeDLArchiver(storage, c.webdriver, c.facebook_cookie), + TwitterArchiver(storage, c.webdriver), + WaybackArchiver(storage, c.webdriver) + ] - driver = webdriver.Firefox(options=options) - driver.set_window_size(1400, 2000) - # in seconds, telegram screenshots catch which don't come back - driver.set_page_load_timeout(120) for archiver in active_archivers: logger.debug(f'Trying {archiver} on row {row}') @@ -121,23 +109,19 @@ def process_sheet(c: Config, sheet, header=1, columns=GWorksheet.COLUMN_NAMES): if result: if result.status in ['success', 'already archived']: - result.status = archiver.name + \ - ": " + str(result.status) - logger.success( - f'{archiver} succeeded on row {row}') + result.status = f"{archiver.name}: {result.status}" + logger.success(f'{archiver} succeeded on row {row}') break - logger.warning( - f'{archiver} did not succeed on row {row}, final status: {result.status}') - result.status = archiver.name + \ - ": " + str(result.status) - # get rid of driver so can reload on next row - driver.quit() + logger.warning(f'{archiver} did not succeed on row {row}, final status: {result.status}') + result.status = f"{archiver.name}: {result.status}" + if result: update_sheet(gw, row, result) else: gw.set_cell(row, 'status', 'failed: no archiver') logger.success(f'Finshed worksheet {wks.title}') + @logger.catch def main(): logger.debug(f'Passed args:{sys.argv}') diff --git a/configs/config.py b/configs/config.py index d5ef5ad..3ba5bcb 100644 --- a/configs/config.py +++ b/configs/config.py @@ -61,15 +61,20 @@ class Config: # selenium driver selenium_configs = execution.get("selenium", {}) self.selenium_timeout = int(selenium_configs.get("timeout_seconds", 10)) - options = webdriver.FirefoxOptions() - options.headless = True - options.set_preference('network.protocol-handler.external.tg', False) - self.webdriver = webdriver.Firefox(options=options) - self.webdriver.set_window_size(1400, 2000) - self.webdriver.set_page_load_timeout(self.selenium_timeout) + self.webdriver = "not initalized" - secrets = self.config.get("secrets", {}) # APIs and service configurations + secrets = self.config.get("secrets", {}) + + # google sheets config + self.gsheets_client = gspread.service_account( + filename=secrets.get("google_api", {}).get("filename", 'service_account.json') + ) + + # facebook config + self.facebook_cookie = secrets.get("facebook", {}).get("cookie", None) + + # s3 config if "s3" in secrets: s3 = secrets["s3"] self.s3_config = S3Config( @@ -86,6 +91,7 @@ class Config: else: logger.debug(f"'s3' key not present in the {self.config_file=}") + # wayback machine config if "wayback" in secrets: self.wayback_config = WaybackConfig( key=secrets["wayback"]["key"], @@ -94,6 +100,7 @@ class Config: else: logger.debug(f"'wayback' key not present in the {self.config_file=}") + # telethon config if "telegram" in secrets: self.telegram_config = TelegramConfig( api_id=secrets["telegram"]["api_id"], @@ -102,10 +109,6 @@ class Config: else: logger.debug(f"'telegram' key not present in the {self.config_file=}") - self.gsheets_client = gspread.service_account( - filename=secrets.get("google_api", {}).get("filename", 'service_account.json') - ) - del self.config["secrets"] def get_argument_parser(self): @@ -133,6 +136,17 @@ class Config: return LocalStorage(self.folder) raise f"storage {self.storage} not yet implemented" + def destroy_webdriver(self): + if self.webdriver is not None: + self.webdriver.quit() + + def recreate_webdriver(self): + options = webdriver.FirefoxOptions() + options.headless = True + options.set_preference('network.protocol-handler.external.tg', False) + self.webdriver = webdriver.Firefox(options=options) + self.webdriver.set_window_size(1400, 2000) + self.webdriver.set_page_load_timeout(self.selenium_timeout) def __str__(self) -> str: return json.dumps({ diff --git a/test.py b/test.py new file mode 100644 index 0000000..4061c9f --- /dev/null +++ b/test.py @@ -0,0 +1,51 @@ +import os +import datetime +import argparse +import requests +import shutil +import gspread +from loguru import logger +from dotenv import load_dotenv +from selenium import webdriver +import traceback + +import archivers +from storages import S3Storage, S3Config +from utils import GWorksheet, mkdir_if_not_exists + +load_dotenv() + + +options = webdriver.FirefoxOptions() +options.headless = True +driver = webdriver.Firefox(options=options) +driver.set_window_size(1400, 2000) + +s3_config = S3Config( + bucket=os.getenv('DO_BUCKET'), + region=os.getenv('DO_SPACES_REGION'), + key=os.getenv('DO_SPACES_KEY'), + secret=os.getenv('DO_SPACES_SECRET'), + folder="temp" +) +s3_client = S3Storage(s3_config) +telegram_config = archivers.TelegramConfig( + api_id=os.getenv('TELEGRAM_API_ID'), + api_hash=os.getenv('TELEGRAM_API_HASH') +) + +archiver = archivers.TelethonArchiver(s3_client, driver, telegram_config) + +URLs = [ + # "https://t.me/c/1226032830/24864", + # "https://t.me/truexanewsua/32650", + "https://t.me/informatsia_obstanovka/5239", + # "https://t.me/informatsia_obstanovka/5240", + # "https://t.me/informatsia_obstanovka/5241", + # "https://t.me/informatsia_obstanovka/5242" +] + + +for url in URLs: + print(url) + print(archiver.download(url, False))