diff --git a/.example.env b/.example.env index deb01a2..4a200cf 100644 --- a/.example.env +++ b/.example.env @@ -5,4 +5,6 @@ DO_BUCKET= INTERNET_ARCHIVE_S3_KEY= INTERNET_ARCHIVE_S3_SECRET= TELEGRAM_API_ID= -TELEGRAM_API_HASH= \ No newline at end of file +TELEGRAM_API_HASH= + +FACEBOOK_COOKIE=cookie: datr= xxxx \ No newline at end of file diff --git a/archivers/twitter_archiver.py b/archivers/twitter_archiver.py index 49ab5ae..099d279 100644 --- a/archivers/twitter_archiver.py +++ b/archivers/twitter_archiver.py @@ -41,7 +41,7 @@ class TwitterArchiver(Archiver): elif type(media) == Gif: urls.append(media.variants[0].url) elif type(media) == Photo: - urls.append(media.fullUrl) + urls.append(media.fullUrl.replace('name=large', 'name=orig')) else: logger.warning(f"Could not get media URL of {media}") diff --git a/archivers/youtubedl_archiver.py b/archivers/youtubedl_archiver.py index bc0456e..426641a 100644 --- a/archivers/youtubedl_archiver.py +++ b/archivers/youtubedl_archiver.py @@ -11,11 +11,15 @@ class YoutubeDLArchiver(Archiver): name = "youtube_dl" ydl_opts = {'outtmpl': f'{Storage.TMP_FOLDER}%(id)s.%(ext)s', 'quiet': False} + def __init__(self, storage: Storage, driver, fb_cookie): + super().__init__(storage, driver) + self.fb_cookie = fb_cookie + def download(self, url, check_if_exists=False): netloc = self.get_netloc(url) - if netloc in ['facebook.com', 'www.facebook.com'] and os.getenv('FB_COOKIE'): - logger.info('Using Facebook cookie') - yt_dlp.utils.std_headers['cookie'] = os.getenv('FB_COOKIE') + if netloc in ['facebook.com', 'www.facebook.com']: + logger.debug('Using Facebook cookie') + yt_dlp.utils.std_headers['cookie'] = self.fb_cookie ydl = yt_dlp.YoutubeDL(YoutubeDLArchiver.ydl_opts) cdn_url = None @@ -30,6 +34,13 @@ class YoutubeDLArchiver(Archiver): if info.get('is_live', False): logger.warning("Live streaming media, not archiving now") return ArchiveResult(status="Streaming media") + if 'twitter.com' in netloc: + if 'https://twitter.com/' in 
info['webpage_url']: + logger.info('Found https://twitter.com/ in the download url from Twitter') + else: + logger.info('Found a linked video probably in a link in a tweet - not getting that video as there may be images in the tweet') + return False + if check_if_exists: if 'entries' in info: diff --git a/auto_archive.py b/auto_archive.py index 3af64c2..287e231 100644 --- a/auto_archive.py +++ b/auto_archive.py @@ -12,6 +12,13 @@ import traceback from archivers import TelethonArchiver, TelegramArchiver, TiktokArchiver, YoutubeDLArchiver, TwitterArchiver, WaybackArchiver, ArchiveResult from utils import GWorksheet, mkdir_if_not_exists, expand_url from configs import Config +import sys + +logger.add("logs/1trace.log", level="TRACE") +logger.add("logs/2info.log", level="INFO") +logger.add("logs/3success.log", level="SUCCESS") +logger.add("logs/4warning.log", level="WARNING") +logger.add("logs/5error.log", level="ERROR") load_dotenv() @@ -55,8 +62,8 @@ def process_sheet(c: Config, sheet, header=1, columns=GWorksheet.COLUMN_NAMES): # loop through worksheets to check for ii, wks in enumerate(sh.worksheets()): - logger.info(f'Opening worksheet {ii}: "{wks.title}" header={c.header}') - gw = GWorksheet(wks, header_row=c.header, columns=c.column_names) + logger.info(f'Opening worksheet {ii=}: {wks.title=} {header=}') + gw = GWorksheet(wks, header_row=header, columns=columns) if not gw.col_exists('url'): logger.warning( @@ -80,6 +87,7 @@ def process_sheet(c: Config, sheet, header=1, columns=GWorksheet.COLUMN_NAMES): YoutubeDLArchiver(storage, c.webdriver), TwitterArchiver(storage, c.webdriver), WaybackArchiver(storage, c.webdriver) + archivers.YoutubeDLArchiver(s3_client, driver, os.getenv('FACEBOOK_COOKIE')), ] # loop through rows in worksheet @@ -91,7 +99,17 @@ def process_sheet(c: Config, sheet, header=1, columns=GWorksheet.COLUMN_NAMES): gw.set_cell(row, 'status', 'Archive in progress') url = expand_url(url) + + # make a new driver so each spreadsheet row is idempotent + 
options = webdriver.FirefoxOptions() + options.headless = True + options.set_preference('network.protocol-handler.external.tg', False) + + driver = webdriver.Firefox(options=options) + driver.set_window_size(1400, 2000) + # page-load timeout in seconds; catches telegram screenshot loads that never come back + driver.set_page_load_timeout(120) for archiver in active_archivers: logger.debug(f'Trying {archiver} on row {row}') @@ -112,15 +130,17 @@ def process_sheet(c: Config, sheet, header=1, columns=GWorksheet.COLUMN_NAMES): f'{archiver} did not succeed on row {row}, final status: {result.status}') result.status = archiver.name + \ ": " + str(result.status) - + # quit the driver so a fresh one can be created on the next row + driver.quit() if result: update_sheet(gw, row, result) else: gw.set_cell(row, 'status', 'failed: no archiver') logger.success(f'Finshed worksheet {wks.title}') - +@logger.catch def main(): + logger.debug(f'Passed args:{sys.argv}') c = Config() c.parse()