diff --git a/src/archivers/__init__.py b/src/archivers/__init__.py index 51d3546..22e142f 100644 --- a/src/archivers/__init__.py +++ b/src/archivers/__init__.py @@ -1,16 +1,16 @@ # we need to explicitly expose the available imports here from .base_archiver import Archiver, ArchiveResult -from .archiver import Archiverv2 # from .telegram_archiver import TelegramArchiver # from .telethon_archiver import TelethonArchiver # from .tiktok_archiver import TiktokArchiver -from .wayback_archiver import WaybackArchiver +# from .wayback_archiver import WaybackArchiver # from .youtubedl_archiver import YoutubeDLArchiver # from .twitter_archiver import TwitterArchiver # from .vk_archiver import VkArchiver # from .twitter_api_archiver import TwitterApiArchiver # from .instagram_archiver import InstagramArchiver +from .archiver import Archiverv2 from .telethon_archiverv2 import TelethonArchiver from .twitter_archiverv2 import TwitterArchiver from .twitter_api_archiverv2 import TwitterApiArchiver diff --git a/src/archivers/archiver.py b/src/archivers/archiver.py index 369dd60..7682e11 100644 --- a/src/archivers/archiver.py +++ b/src/archivers/archiver.py @@ -15,9 +15,8 @@ class Archiverv2(Step): # without this STEP.__init__ is not called super().__init__(config) - # only for typing... - def init(name: str, config: dict) -> Archiverv2: + # only for typing... return Step.init(name, config, Archiverv2) def setup(self) -> None: @@ -58,3 +57,19 @@ class Archiverv2(Step): @abstractmethod def download(self, item: Metadata) -> Metadata: pass + + # TODO: how to fix allow predictable key + # def get_key(self, filename): + # """ + # returns a key in the format "[archiverName]_[filename]" includes extension + # """ + # tail = os.path.split(filename)[1] # returns filename.ext from full path + # _id, extension = os.path.splitext(tail) # returns [filename, .ext] + # if 'unknown_video' in _id: + # _id = _id.replace('unknown_video', 'jpg') + + # # long filenames can cause problems, so trim them if necessary + # if len(_id) > 128: + # _id = _id[-128:] + + # return f'{self.name}_{_id}{extension}' \ No newline at end of file diff --git a/src/archivers/instagram_archiver.py b/src/archivers/instagram_archiver.py deleted file mode 100644 index 62db876..0000000 --- a/src/archivers/instagram_archiver.py +++ /dev/null @@ -1,140 +0,0 @@ -import re, os, shutil, html, traceback -import instaloader # https://instaloader.github.io/as-module.html -from loguru import logger - -from .base_archiver import Archiver, ArchiveResult -from configs import Config -from storages import Storage - - -class InstagramArchiver(Archiver): - """ - Uses Instaloader to download either a post (inc images, videos, text) or as much as possible from a profile (posts, stories, highlights, ) - """ - name = "instagram" - DOWNLOAD_FOLDER = "instaloader" - # NB: post should be tested before profile - # https://regex101.com/r/MGPquX/1 - post_pattern = re.compile(r"(?:(?:http|https):\/\/)?(?:www.)?(?:instagram.com|instagr.am|instagr.com)\/(?:p|reel)\/(\w+)") - # https://regex101.com/r/6Wbsxa/1 - profile_pattern = re.compile(r"(?:(?:http|https):\/\/)?(?:www.)?(?:instagram.com|instagr.am|instagr.com)\/(\w+)") - - def __init__(self, storage: Storage, config: Config): - super().__init__(storage, config) - self.insta = instaloader.Instaloader(download_geotags=True, download_comments=True, compress_json=False, dirname_pattern=self.DOWNLOAD_FOLDER, filename_pattern="{date_utc}_UTC_{target}__{typename}") - if config.instagram_config: - try: - self.insta.load_session_from_file(config.instagram_config.username, config.instagram_config.session_file) - except Exception as e: - logger.error(f"Unable to login from session file: {e}\n{traceback.format_exc()}") - try: - self.insta.login(config.instagram_config.username, config.instagram_config. - password) - #TODO: wait for this issue to be fixed https://github.com/instaloader/instaloader/issues/1758 - self.insta.save_session_to_file(config.instagram_config.session_file) - except Exception as e2: - logger.error(f"Unable to finish login (retrying from file): {e2}\n{traceback.format_exc()}") - - - - def download(self, url, check_if_exists=False): - post_matches = self.post_pattern.findall(url) - profile_matches = self.profile_pattern.findall(url) - - # return if not a valid instagram link - if not len(post_matches) and not len(profile_matches): - return - - # check if already uploaded - key = self.get_html_key(url) - if check_if_exists and self.storage.exists(key): - # only s3 storage supports storage.exists as not implemented on gd - cdn_url = self.storage.get_cdn_url(key) - screenshot = self.get_screenshot(url) - wacz = self.get_wacz(url) - return self.generateArchiveResult(status='already archived', cdn_url=cdn_url, screenshot=screenshot, wacz=wacz) - - try: - # process if post - if len(post_matches): - return self.download_post(url, post_matches[0]) - - # process if profile - if len(profile_matches): - return self.download_profile(url, profile_matches[0]) - finally: - shutil.rmtree(self.DOWNLOAD_FOLDER, ignore_errors=True) - - def download_post(self, url, post_id): - logger.debug(f"Instagram {post_id=} detected in {url=}") - - post = instaloader.Post.from_shortcode(self.insta.context, post_id) - if self.insta.download_post(post, target=post.owner_username): - return self.upload_downloaded_content(url, post.title, post._asdict(), post.date) - - def download_profile(self, url, username): - # gets posts, posts where username is tagged, igtv postss, stories, and highlights - logger.debug(f"Instagram {username=} detected in {url=}") - - profile = instaloader.Profile.from_username(self.insta.context, username) - try: - for post in profile.get_posts(): - try: self.insta.download_post(post, target=f"profile_post_{post.owner_username}") - except Exception as e: logger.error(f"Failed to download post: {post.shortcode}: {e}") - except Exception as e: logger.error(f"Failed profile.get_posts: {e}") - - try: - for post in profile.get_tagged_posts(): - try: self.insta.download_post(post, target=f"tagged_post_{post.owner_username}") - except Exception as e: logger.error(f"Failed to download tagged post: {post.shortcode}: {e}") - except Exception as e: logger.error(f"Failed profile.get_tagged_posts: {e}") - - try: - for post in profile.get_igtv_posts(): - try: self.insta.download_post(post, target=f"igtv_post_{post.owner_username}") - except Exception as e: logger.error(f"Failed to download igtv post: {post.shortcode}: {e}") - except Exception as e: logger.error(f"Failed profile.get_igtv_posts: {e}") - - try: - for story in self.insta.get_stories([profile.userid]): - for item in story.get_items(): - try: self.insta.download_storyitem(item, target=f"story_item_{story.owner_username}") - except Exception as e: logger.error(f"Failed to download story item: {item}: {e}") - except Exception as e: logger.error(f"Failed get_stories: {e}") - - try: - for highlight in self.insta.get_highlights(profile.userid): - for item in highlight.get_items(): - try: self.insta.download_storyitem(item, target=f"highlight_item_{highlight.owner_username}") - except Exception as e: logger.error(f"Failed to download highlight item: {item}: {e}") - except Exception as e: logger.error(f"Failed get_highlights: {e}") - - return self.upload_downloaded_content(url, f"@{username}", profile._asdict(), None) - - def upload_downloaded_content(self, url, title, content, date): - status = "success" - try: - uploaded_media = [] - for f in os.listdir(self.DOWNLOAD_FOLDER): - if os.path.isfile((filename := os.path.join(self.DOWNLOAD_FOLDER, f))): - key = self.get_key(filename) - self.storage.upload(filename, key) - hash = self.get_hash(filename) - cdn_url = self.storage.get_cdn_url(key) - uploaded_media.append({'cdn_url': cdn_url, 'key': key, 'hash': hash}) - assert len(uploaded_media) > 1, "No uploaded media found" - - uploaded_media.sort(key=lambda m:m["key"], reverse=True) - - page_cdn, page_hash, _ = self.generate_media_page_html(url, uploaded_media, html.escape(str(content))) - except Exception as e: - logger.error(f"Could not fetch instagram post {url} due to: {e}") - status = "error" - finally: - shutil.rmtree(self.DOWNLOAD_FOLDER, ignore_errors=True) - - if status == "success": - screenshot = self.get_screenshot(url) - wacz = self.get_wacz(url) - - return self.generateArchiveResult(status=status, cdn_url=page_cdn, title=title, timestamp=date, hash=page_hash, screenshot=screenshot, wacz=wacz) diff --git a/src/archivers/telegram_archiver.py b/src/archivers/telegram_archiver.py deleted file mode 100644 index c6d8747..0000000 --- a/src/archivers/telegram_archiver.py +++ /dev/null @@ -1,89 +0,0 @@ -import os, requests, re - -import html -from bs4 import BeautifulSoup -from loguru import logger - -from .base_archiver import Archiver, ArchiveResult -from storages import Storage - - -class TelegramArchiver(Archiver): - name = "telegram" - - def download(self, url, check_if_exists=False): - # detect URLs that we definitely cannot handle - if 't.me' != self.get_netloc(url): - return False - - headers = { - 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36' - } - status = "success" - - original_url = url - - # TODO: check if we can do this more resilient to variable URLs - if url[-8:] != "?embed=1": - url += "?embed=1" - - screenshot = self.get_screenshot(url) - wacz = self.get_wacz(url) - - t = requests.get(url, headers=headers) - s = BeautifulSoup(t.content, 'html.parser') - video = s.find("video") - - if video is None: - logger.warning("could not find video") - image_tags = s.find_all(class_="js-message_photo") - - images = [] - for im in image_tags: - urls = [u.replace("'", "") for u in re.findall(r'url\((.*?)\)', im['style'])] - images += urls - - page_cdn, page_hash, thumbnail = self.generate_media_page(images, url, html.escape(str(t.content))) - time_elements = s.find_all('time') - timestamp = time_elements[0].get('datetime') if len(time_elements) else None - - return self.generateArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=timestamp, wacz=wacz) - - video_url = video.get('src') - video_id = video_url.split('/')[-1].split('?')[0] - key = self.get_key(video_id) - - filename = os.path.join(Storage.TMP_FOLDER, key) - - if check_if_exists and self.storage.exists(key): - status = 'already archived' - - v = requests.get(video_url, headers=headers) - - with open(filename, 'wb') as f: - f.write(v.content) - - if status != 'already archived': - self.storage.upload(filename, key) - - hash = self.get_hash(filename) - - # extract duration from HTML - try: - duration = s.find_all('time')[0].contents[0] - if ':' in duration: - duration = float(duration.split( - ':')[0]) * 60 + float(duration.split(':')[1]) - else: - duration = float(duration) - except: - duration = "" - - # process thumbnails - key_thumb, thumb_index = self.get_thumbnails( - filename, key, duration=duration) - os.remove(filename) - - cdn_url = self.storage.get_cdn_url(key) - return self.generateArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb, thumbnail_index=thumb_index, - duration=duration, title=original_url, timestamp=s.find_all('time')[1].get('datetime'), hash=hash, screenshot=screenshot, wacz=wacz) diff --git a/src/archivers/telethon_archiver.py b/src/archivers/telethon_archiver.py deleted file mode 100644 index a2cbf0a..0000000 --- a/src/archivers/telethon_archiver.py +++ /dev/null @@ -1,125 +0,0 @@ -import os, re, html -from loguru import logger -from telethon.sync import TelegramClient -from telethon.errors import ChannelInvalidError - -from storages import Storage -from .base_archiver import Archiver, ArchiveResult -from configs import Config -from utils import getattr_or - - -class TelethonArchiver(Archiver): - name = "telethon" - link_pattern = re.compile(r"https:\/\/t\.me(\/c){0,1}\/(.+)\/(\d+)") - - def __init__(self, storage: Storage, config: Config): - super().__init__(storage, config) - if config.telegram_config: - c = config.telegram_config - self.client = TelegramClient("./anon.session", c.api_id, c.api_hash) - self.bot_token = c.bot_token - - def _get_media_posts_in_group(self, chat, original_post, max_amp=10): - """ - Searches for Telegram posts that are part of the same group of uploads - The search is conducted around the id of the original post with an amplitude - of `max_amp` both ways - Returns a list of [post] where each post has media and is in the same grouped_id - """ - if getattr_or(original_post, "grouped_id") is None: - return [original_post] if getattr_or(original_post, "media") else [] - - search_ids = [i for i in range(original_post.id - max_amp, original_post.id + max_amp + 1)] - posts = self.client.get_messages(chat, ids=search_ids) - media = [] - for post in posts: - if post is not None and post.grouped_id == original_post.grouped_id and post.media is not None: - media.append(post) - return media - - def download(self, url, check_if_exists=False): - if not hasattr(self, "client"): - logger.warning('Missing Telethon config') - return False - - # detect URLs that we definitely cannot handle - matches = self.link_pattern.findall(url) - if not len(matches): - return False - - status = "success" - - # app will ask (stall for user input!) for phone number and auth code if anon.session not found - with self.client.start(bot_token=self.bot_token): - matches = list(matches[0]) - chat, post_id = matches[1], matches[2] - - post_id = int(post_id) - - try: - post = self.client.get_messages(chat, ids=post_id) - except ValueError as e: - logger.error(f"Could not fetch telegram {url} possibly it's private: {e}") - return False - except ChannelInvalidError as e: - logger.error(f"Could not fetch telegram {url}. This error can be fixed if you setup a bot_token in addition to api_id and api_hash: {e}") - return False - - if post is None: return False - - media_posts = self._get_media_posts_in_group(chat, post) - logger.debug(f'got {len(media_posts)=} for {url=}') - - screenshot = self.get_screenshot(url) - wacz = self.get_wacz(url) - - if len(media_posts) > 0: - key = self.get_html_key(url) - - if check_if_exists and self.storage.exists(key): - # only s3 storage supports storage.exists as not implemented on gd - cdn_url = self.storage.get_cdn_url(key) - return self.generateArchiveResult(status='already archived', cdn_url=cdn_url, title=post.message, timestamp=post.date, screenshot=screenshot, wacz=wacz) - - key_thumb, thumb_index = None, None - group_id = post.grouped_id if post.grouped_id is not None else post.id - uploaded_media = [] - message = post.message - for mp in media_posts: - if len(mp.message) > len(message): message = mp.message - - # media can also be in entities - if mp.entities: - other_media_urls = [e.url for e in mp.entities if hasattr(e, "url") and e.url and self._guess_file_type(e.url) in ["video", "image"]] - logger.debug(f"Got {len(other_media_urls)} other medial urls from {mp.id=}: {other_media_urls}") - for om_url in other_media_urls: - filename = os.path.join(Storage.TMP_FOLDER, f'{chat}_{group_id}_{self._get_key_from_url(om_url)}') - self.download_from_url(om_url, filename) - key = filename.split(Storage.TMP_FOLDER)[1] - self.storage.upload(filename, key) - hash = self.get_hash(filename) - cdn_url = self.storage.get_cdn_url(key) - uploaded_media.append({'cdn_url': cdn_url, 'key': key, 'hash': hash}) - - filename_dest = os.path.join(Storage.TMP_FOLDER, f'{chat}_{group_id}', str(mp.id)) - filename = self.client.download_media(mp.media, filename_dest) - if not filename: - logger.debug(f"Empty media found, skipping {str(mp)=}") - continue - - key = filename.split(Storage.TMP_FOLDER)[1] - self.storage.upload(filename, key) - hash = self.get_hash(filename) - cdn_url = self.storage.get_cdn_url(key) - uploaded_media.append({'cdn_url': cdn_url, 'key': key, 'hash': hash}) - if key_thumb is None: - key_thumb, thumb_index = self.get_thumbnails(filename, key) - os.remove(filename) - - page_cdn, page_hash, _ = self.generate_media_page_html(url, uploaded_media, html.escape(str(post))) - - return self.generateArchiveResult(status=status, cdn_url=page_cdn, title=message, timestamp=post.date, hash=page_hash, screenshot=screenshot, thumbnail=key_thumb, thumbnail_index=thumb_index, wacz=wacz) - - page_cdn, page_hash, _ = self.generate_media_page_html(url, [], html.escape(str(post))) - return self.generateArchiveResult(status=status, cdn_url=page_cdn, title=post.message, timestamp=getattr_or(post, "date"), hash=page_hash, screenshot=screenshot, wacz=wacz) diff --git a/src/archivers/tiktok_archiver.py b/src/archivers/tiktok_archiver.py deleted file mode 100644 index 55cb97e..0000000 --- a/src/archivers/tiktok_archiver.py +++ /dev/null @@ -1,72 +0,0 @@ -import os, traceback -import tiktok_downloader -from loguru import logger - -from .base_archiver import Archiver, ArchiveResult -from storages import Storage - - -class TiktokArchiver(Archiver): - name = "tiktok" - - def download(self, url, check_if_exists=False): - if 'tiktok.com' not in url: - return False - - status = 'success' - - try: - info = tiktok_downloader.info_post(url) - key = self.get_key(f'{info.id}.mp4') - filename = os.path.join(Storage.TMP_FOLDER, key) - logger.info(f'found video {key=}') - - if check_if_exists and self.storage.exists(key): - status = 'already archived' - - media = tiktok_downloader.snaptik(url).get_media() - - if len(media) <= 0: - if status == 'already archived': - return self.generateArchiveResult(status='Could not download media, but already archived', cdn_url=self.storage.get_cdn_url(key)) - else: - return self.generateArchiveResult(status='Could not download media') - - logger.info(f'downloading video {key=}') - media[0].download(filename) - - if status != 'already archived': - logger.info(f'uploading video {key=}') - self.storage.upload(filename, key) - - try: - key_thumb, thumb_index = self.get_thumbnails(filename, key, duration=info.duration) - except Exception as e: - logger.error(e) - key_thumb = '' - thumb_index = 'error creating thumbnails' - - hash = self.get_hash(filename) - screenshot = self.get_screenshot(url) - wacz = self.get_wacz(url) - - try: os.remove(filename) - except FileNotFoundError: - logger.info(f'tmp file not found thus not deleted {filename}') - cdn_url = self.storage.get_cdn_url(key) - timestamp = info.create.isoformat() if hasattr(info, "create") else None - - return self.generateArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb, - thumbnail_index=thumb_index, duration=getattr(info, "duration", 0), title=getattr(info, "caption", ""), - timestamp=timestamp, hash=hash, screenshot=screenshot, wacz=wacz) - - except tiktok_downloader.Except.InvalidUrl as e: - status = 'Invalid URL' - logger.warning(f'Invalid URL on {url} {e}\n{traceback.format_exc()}') - return self.generateArchiveResult(status=status) - - except: - error = traceback.format_exc() - status = 'Other Tiktok error: ' + str(error) - logger.warning(f'Other Tiktok error' + str(error)) - return self.generateArchiveResult(status=status) diff --git a/src/archivers/twitter_api_archiver.py b/src/archivers/twitter_api_archiver.py deleted file mode 100644 index da56d31..0000000 --- a/src/archivers/twitter_api_archiver.py +++ /dev/null @@ -1,75 +0,0 @@ - -import json -from datetime import datetime -from loguru import logger -from pytwitter import Api - -from storages.base_storage import Storage -from configs import Config -from .base_archiver import ArchiveResult -from .twitter_archiver import TwitterArchiver - - -class TwitterApiArchiver(TwitterArchiver): - name = "twitter_api" - - def __init__(self, storage: Storage, config: Config): - super().__init__(storage, config) - c = config.twitter_config - - if c.bearer_token: - self.api = Api(bearer_token=c.bearer_token) - elif c.consumer_key and c.consumer_secret and c.access_token and c.access_secret: - self.api = Api( - consumer_key=c.consumer_key, consumer_secret=c.consumer_secret, access_token=c.access_token, access_secret=c.access_secret) - - def download(self, url, check_if_exists=False): - if not hasattr(self, "api"): - logger.warning('Missing Twitter API config') - return False - - username, tweet_id = self.get_username_tweet_id(url) - if not username: return False - - tweet = self.api.get_tweet(tweet_id, expansions=["attachments.media_keys"], media_fields=["type", "duration_ms", "url", "variants"], tweet_fields=["attachments", "author_id", "created_at", "entities", "id", "text", "possibly_sensitive"]) - timestamp = datetime.strptime(tweet.data.created_at, "%Y-%m-%dT%H:%M:%S.%fZ") - - # check if exists - key = self.get_html_key(url) - if check_if_exists and self.storage.exists(key): - # only s3 storage supports storage.exists as not implemented on gd - cdn_url = self.storage.get_cdn_url(key) - screenshot = self.get_screenshot(url) - return self.generateArchiveResult(status='already archived', cdn_url=cdn_url, title=tweet.data.text, timestamp=timestamp, screenshot=screenshot) - - urls = [] - if tweet.includes: - for m in tweet.includes.media: - if m.url: - urls.append(m.url) - elif hasattr(m, "variants"): - var_url = self.choose_variant(m.variants) - urls.append(var_url) - else: - urls.append(None) # will trigger error - - for u in urls: - if u is None: - logger.debug(f"Should not have gotten None url for {tweet.includes.media=} so going to download_alternative in twitter_archiver") - return self.download_alternative(url, tweet_id) - logger.debug(f"found {urls=}") - - output = json.dumps({ - "id": tweet.data.id, - "text": tweet.data.text, - "created_at": tweet.data.created_at, - "author_id": tweet.data.author_id, - "geo": tweet.data.geo, - "lang": tweet.data.lang, - "media": urls - }, ensure_ascii=False, indent=4) - - screenshot = self.get_screenshot(url) - wacz = self.get_wacz(url) - page_cdn, page_hash, thumbnail = self.generate_media_page(urls, url, output) - return self.generateArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=timestamp, title=tweet.data.text, wacz=wacz) diff --git a/src/archivers/twitter_archiver.py b/src/archivers/twitter_archiver.py deleted file mode 100644 index f1f22c0..0000000 --- a/src/archivers/twitter_archiver.py +++ /dev/null @@ -1,105 +0,0 @@ -import html, re, requests -from datetime import datetime -from loguru import logger -from snscrape.modules.twitter import TwitterTweetScraper, Video, Gif, Photo - -from .base_archiver import Archiver, ArchiveResult - -class TwitterArchiver(Archiver): - """ - This Twitter Archiver uses unofficial scraping methods, and it works as - an alternative to TwitterApiArchiver when no API credentials are provided. - """ - - name = "twitter" - link_pattern = re.compile(r"twitter.com\/(?:\#!\/)?(\w+)\/status(?:es)?\/(\d+)") - - def get_username_tweet_id(self, url): - # detect URLs that we definitely cannot handle - matches = self.link_pattern.findall(url) - if not len(matches): return False, False - - username, tweet_id = matches[0] # only one URL supported - logger.debug(f"Found {username=} and {tweet_id=} in {url=}") - - return username, tweet_id - - def download(self, url, check_if_exists=False): - username, tweet_id = self.get_username_tweet_id(url) - if not username: return False - - scr = TwitterTweetScraper(tweet_id) - - try: - tweet = next(scr.get_items()) - except Exception as ex: - logger.warning(f"can't get tweet: {type(ex).__name__} occurred. args: {ex.args}") - return self.download_alternative(url, tweet_id) - - if tweet.media is None: - logger.debug(f'No media found, archiving tweet text only') - screenshot = self.get_screenshot(url) - wacz = self.get_wacz(url) - page_cdn, page_hash, _ = self.generate_media_page_html(url, [], html.escape(tweet.json())) - return self.generateArchiveResult(status="success", cdn_url=page_cdn, title=tweet.content, timestamp=tweet.date, hash=page_hash, screenshot=screenshot, wacz=wacz) - - urls = [] - - for media in tweet.media: - if type(media) == Video: - variant = max( - [v for v in media.variants if v.bitrate], key=lambda v: v.bitrate) - urls.append(variant.url) - elif type(media) == Gif: - urls.append(media.variants[0].url) - elif type(media) == Photo: - urls.append(media.fullUrl.replace('name=large', 'name=orig')) - else: - logger.warning(f"Could not get media URL of {media}") - - page_cdn, page_hash, thumbnail = self.generate_media_page(urls, url, tweet.json()) - - screenshot = self.get_screenshot(url) - wacz = self.get_wacz(url) - - return self.generateArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=tweet.date, title=tweet.content, wacz=wacz) - - def download_alternative(self, url, tweet_id): - # https://stackoverflow.com/a/71867055/6196010 - logger.debug(f"Trying twitter hack for {url=}") - hack_url = f"https://cdn.syndication.twimg.com/tweet?id={tweet_id}" - r = requests.get(hack_url) - if r.status_code != 200: return False - tweet = r.json() - - urls = [] - for p in tweet["photos"]: - urls.append(p["url"]) - - # 1 tweet has 1 video max - if "video" in tweet: - v = tweet["video"] - urls.append(self.choose_variant(v.get("variants", []))) - - logger.debug(f"Twitter hack got {urls=}") - - timestamp = datetime.strptime(tweet["created_at"], "%Y-%m-%dT%H:%M:%S.%fZ") - screenshot = self.get_screenshot(url) - wacz = self.get_wacz(url) - page_cdn, page_hash, thumbnail = self.generate_media_page(urls, url, r.text) - return self.generateArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=timestamp, title=tweet["text"], wacz=wacz) - - def choose_variant(self, variants): - # choosing the highest quality possible - variant, width, height = None, 0, 0 - for var in variants: - if var["type"] == "video/mp4": - width_height = re.search(r"\/(\d+)x(\d+)\/", var["src"]) - if width_height: - w, h = int(width_height[1]), int(width_height[2]) - if w > width or h > height: - width, height = w, h - variant = var.get("src", variant) - else: - variant = var.get("src") if not variant else variant - return variant diff --git a/src/archivers/vk_archiver.py b/src/archivers/vk_archiver.py deleted file mode 100644 index 1d38fa9..0000000 --- a/src/archivers/vk_archiver.py +++ /dev/null @@ -1,74 +0,0 @@ -import re, json, mimetypes, os - -from loguru import logger -from vk_url_scraper import VkScraper, DateTimeEncoder - -from storages import Storage -from .base_archiver import Archiver, ArchiveResult -from configs import Config - - -class VkArchiver(Archiver): - """" - VK videos are handled by YTDownloader, this archiver gets posts text and images. - Currently only works for /wall posts - """ - name = "vk" - wall_pattern = re.compile(r"(wall.{0,1}\d+_\d+)") - photo_pattern = re.compile(r"(photo.{0,1}\d+_\d+)") - - def __init__(self, storage: Storage, config: Config): - super().__init__(storage, config) - if config.vk_config != None: - self.vks = VkScraper(config.vk_config.username, config.vk_config.password) - - def download(self, url, check_if_exists=False): - if not hasattr(self, "vks") or self.vks is None: - logger.debug("VK archiver was not supplied with credentials.") - return False - - key = self.get_html_key(url) - # if check_if_exists and self.storage.exists(key): - # screenshot = self.get_screenshot(url) - # cdn_url = self.storage.get_cdn_url(key) - # return self.generateArchiveResult(status="already archived", cdn_url=cdn_url, screenshot=screenshot) - - results = self.vks.scrape(url) # some urls can contain multiple wall/photo/... parts and all will be fetched - if len(results) == 0: - return False - - def dump_payload(p): return json.dumps(p, ensure_ascii=False, indent=4, cls=DateTimeEncoder) - textual_output = "" - title, datetime = results[0]["text"], results[0]["datetime"] - urls_found = [] - for res in results: - textual_output += f"id: {res['id']}
time utc: {res['datetime']}
text: {res['text']}
payload: {dump_payload(res['payload'])}


" - title = res["text"] if len(title) == 0 else title - datetime = res["datetime"] if not datetime else datetime - for attachments in res["attachments"].values(): - urls_found.extend(attachments) - - # we don't call generate_media_page which downloads urls because it cannot download vk video urls - thumbnail, thumbnail_index = None, None - uploaded_media = [] - filenames = self.vks.download_media(results, Storage.TMP_FOLDER) - for filename in filenames: - key = self.get_key(filename) - self.storage.upload(filename, key) - hash = self.get_hash(filename) - cdn_url = self.storage.get_cdn_url(key) - try: - _type = mimetypes.guess_type(filename)[0].split("/")[0] - if _type == "image" and thumbnail is None: - thumbnail = cdn_url - if _type == "video" and (thumbnail is None or thumbnail_index is None): - thumbnail, thumbnail_index = self.get_thumbnails(filename, key) - except Exception as e: - logger.warning(f"failed to get thumb for {filename=} with {e=}") - uploaded_media.append({'cdn_url': cdn_url, 'key': key, 'hash': hash}) - - page_cdn, page_hash, thumbnail = self.generate_media_page_html(url, uploaded_media, textual_output, thumbnail=thumbnail) - # # if multiple wall/photos/videos are present the screenshot will only grab the 1st - screenshot = self.get_screenshot(url) - wacz = self.get_wacz(url) - return self.generateArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, thumbnail_index=thumbnail_index, timestamp=datetime, title=title, wacz=wacz) diff --git a/src/archivers/wayback_archiver.py b/src/archivers/wayback_archiver.py deleted file mode 100644 index 1bfa78a..0000000 --- a/src/archivers/wayback_archiver.py +++ /dev/null @@ -1,89 +0,0 @@ -import time, requests - -from loguru import logger -from bs4 import BeautifulSoup - -from storages import Storage -from .base_archiver import Archiver, ArchiveResult -from configs import Config - - -class WaybackArchiver(Archiver): - """ - This archiver could implement a check_if_exists by going to "https://web.archive.org/web/{url}" - but that might not be desirable since the webpage might have been archived a long time ago and thus have changed - """ - name = "wayback" - - def __init__(self, storage: Storage, config: Config): - super(WaybackArchiver, self).__init__(storage, config) - self.config = config.wayback_config - self.seen_urls = {} - - def download(self, url, check_if_exists=False): - if self.config is None: - logger.error('Missing Wayback config') - return False - if check_if_exists: - if url in self.seen_urls: return self.seen_urls[url] - - screenshot = self.get_screenshot(url) - wacz = self.get_wacz(url) - - logger.debug(f"POSTing {url=} to web.archive.org") - ia_headers = { - "Accept": "application/json", - "Authorization": f"LOW {self.config.key}:{self.config.secret}" - } - r = requests.post('https://web.archive.org/save/', headers=ia_headers, data={'url': url}) - - if r.status_code != 200: - logger.warning(f"Internet archive failed with status of {r.status_code}") - return self.generateArchiveResult(status="Internet archive failed", screenshot=screenshot, wacz=wacz) - - if 'job_id' not in r.json() and 'message' in r.json(): - return self.custom_retry(r.json(), screenshot=screenshot, wacz=wacz) - - job_id = r.json()['job_id'] - logger.debug(f"GETting status for {job_id=} on {url=}") - status_r = requests.get(f'https://web.archive.org/save/status/{job_id}', headers=ia_headers) - retries = 0 - - # TODO: make the job queue parallel -> consider propagation of results back to sheet though - # wait 90-120 seconds for the archive job to finish - while (status_r.status_code != 200 or status_r.json()['status'] == 'pending') and retries < 30: - time.sleep(3) - try: - logger.debug(f"GETting status for {job_id=} on {url=} [{retries=}]") - status_r = requests.get(f'https://web.archive.org/save/status/{job_id}', headers=ia_headers) - except: - time.sleep(1) - retries += 1 - - if status_r.status_code != 200: - return self.generateArchiveResult(status=f"Internet archive failed: check https://web.archive.org/save/status/{job_id}", screenshot=screenshot, wacz=wacz) - - status_json = status_r.json() - if status_json['status'] != 'success': - return self.custom_retry(status_json, screenshot=screenshot, wacz=wacz) - - archive_url = f"https://web.archive.org/web/{status_json['timestamp']}/{status_json['original_url']}" - - try: - req = requests.get(archive_url) - parsed = BeautifulSoup(req.content, 'html.parser') - title = parsed.find_all('title')[0].text - if title == 'Wayback Machine': - title = 'Could not get title' - except: - title = "Could not get title" - self.seen_urls[url] = self.generateArchiveResult(status='success', cdn_url=archive_url, title=title, screenshot=screenshot, wacz=wacz) - return self.seen_urls[url] - - def custom_retry(self, json_data, **kwargs): - logger.warning(f"Internet archive failed json \n {json_data}") - if "please try again" in str(json_data).lower(): - return self.signal_retry_in(**kwargs) - if "this host has been already captured" in str(json_data).lower(): - return self.signal_retry_in(**kwargs, min_seconds=86400, max_seconds=129600) # 24h to 36h later - return self.generateArchiveResult(status=f"Internet archive failed: {json_data}", **kwargs) diff --git a/src/archivers/youtubedl_archiver.py b/src/archivers/youtubedl_archiver.py deleted file mode 100644 index e2f27a2..0000000 --- a/src/archivers/youtubedl_archiver.py +++ /dev/null @@ -1,118 +0,0 @@ - -import os, datetime - -import yt_dlp -from loguru import logger - -from .base_archiver import Archiver, ArchiveResult -from storages import Storage -from configs import Config - - -class YoutubeDLArchiver(Archiver): - name = "youtube_dl" - ydl_opts = {'outtmpl': f'{Storage.TMP_FOLDER}%(id)s.%(ext)s', 'quiet': False} - - def __init__(self, storage: Storage, config: Config): - super().__init__(storage, config) - self.fb_cookie = config.facebook_cookie - - def download(self, url, check_if_exists=False): - netloc = self.get_netloc(url) - if netloc in ['facebook.com', 'www.facebook.com'] and self.fb_cookie: - logger.debug('Using Facebook cookie') - yt_dlp.utils.std_headers['cookie'] = self.fb_cookie - - ydl = yt_dlp.YoutubeDL(YoutubeDLArchiver.ydl_opts) - cdn_url = None - status = 'success' - - try: - info = ydl.extract_info(url, download=False) - except yt_dlp.utils.DownloadError as e: - logger.debug(f'No video - Youtube normal control flow: {e}') - return False - except Exception as e: - logger.debug(f'ytdlp exception which is normal for example a facebook page with images only will cause a IndexError: list index out of range. Exception here is: \n {e}') - return False - - if info.get('is_live', False): - logger.warning("Live streaming media, not archiving now") - return self.generateArchiveResult(status="Streaming media") - - if 'twitter.com' in netloc: - if 'https://twitter.com/' in info['webpage_url']: - logger.info('Found https://twitter.com/ in the download url from Twitter') - else: - logger.info('Found a linked video probably in a link in a tweet - not getting that video as there may be images in the tweet') - return False - - if check_if_exists: - if 'entries' in info: - if len(info['entries']) > 1: - logger.warning('YoutubeDLArchiver succeeded but cannot archive channels or pages with multiple videos') - return False - elif len(info['entries']) == 0: - logger.warning( - 'YoutubeDLArchiver succeeded but did not find video') - return False - - filename = ydl.prepare_filename(info['entries'][0]) - else: - filename = ydl.prepare_filename(info) - - key = self.get_key(filename) - - if self.storage.exists(key): - status = 'already archived' - cdn_url = self.storage.get_cdn_url(key) - - # sometimes this results in a different filename, so do this again - info = ydl.extract_info(url, download=True) - - # TODO: add support for multiple videos - if 'entries' in info: - if len(info['entries']) > 1: - logger.warning( - 'YoutubeDLArchiver cannot archive channels or pages with multiple videos') - return False - else: - info = info['entries'][0] - - filename = ydl.prepare_filename(info) - - if not os.path.exists(filename): - filename = filename.split('.')[0] + '.mkv' - - if status != 'already archived': - key = self.get_key(filename) - self.storage.upload(filename, key) - - # filename ='tmp/sDE-qZdi8p8.webm' - # key ='SM0022/youtube_dl_sDE-qZdi8p8.webm' - cdn_url = self.storage.get_cdn_url(key) - - hash = self.get_hash(filename) - screenshot = self.get_screenshot(url) - wacz = self.get_wacz(url) - - # get duration - duration = info.get('duration') - - # get thumbnails - try: - key_thumb, thumb_index = self.get_thumbnails(filename, key, duration=duration) - except: - key_thumb = '' - thumb_index = 'Could not generate thumbnails' - - os.remove(filename) - - timestamp = None - if 'timestamp' in info and info['timestamp'] is not None: - timestamp = datetime.datetime.utcfromtimestamp(info['timestamp']).replace(tzinfo=datetime.timezone.utc).isoformat() - elif 'upload_date' in info and info['upload_date'] is not None: - timestamp = datetime.datetime.strptime(info['upload_date'], '%Y%m%d').replace(tzinfo=datetime.timezone.utc) - - return self.generateArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb, thumbnail_index=thumb_index, duration=duration, - title=info['title'] if 'title' in info else None, timestamp=timestamp, hash=hash, screenshot=screenshot, wacz=wacz) diff --git a/src/configs/v2config.py b/src/configs/v2config.py index dec3565..7b0820d 100644 --- a/src/configs/v2config.py +++ b/src/configs/v2config.py @@ -57,7 +57,12 @@ class ConfigV2: assert "." not in child.name, f"class prop name cannot contain dots('.'): {child.name}" assert "." not in config, f"config property cannot contain dots('.'): {config}" config_path = f"{child.name}.{config}" - parser.add_argument(f'--{config_path}', action='store', dest=config_path, help=f"{details['help']} (defaults to {details['default']})", choices=details.get("choices", None)) + try: + parser.add_argument(f'--{config_path}', action='store', dest=config_path, help=f"{details['help']} (defaults to {details['default']})", choices=details.get("choices", None)) + except argparse.ArgumentError: + # captures cases when a Step is used in 2 flows, eg: wayback enricher vs wayback archiver + pass + self.defaults[config_path] = details["default"] if "cli_set" in details: self.cli_ops[config_path] = details["cli_set"] @@ -92,7 +97,7 @@ class ConfigV2: self.feeder = Feeder.init(steps.get("feeder", "cli_feeder"), self.config) self.formatter = Formatter.init(steps.get("formatter", "html_formatter"), self.config) self.enrichers = [Enricher.init(e, self.config) for e in steps.get("enrichers", [])] - self.archivers = [Archiverv2.init(e, self.config) for e in steps.get("archivers", [])] + self.archivers = [Archiverv2.init(e, self.config) for e in (steps.get("archivers") or [])] self.databases = [Database.init(e, self.config) for e in steps.get("databases", [])] self.storages = [StorageV2.init(e, self.config) for e in steps.get("storages", [])] diff --git a/src/enrichers/__init__.py b/src/enrichers/__init__.py index 8b9220b..fe9cc68 100644 --- a/src/enrichers/__init__.py +++ b/src/enrichers/__init__.py @@ -1,5 +1,6 @@ from .enricher import Enricher from .screenshot_enricher import ScreenshotEnricher -from .wayback_enricher import WaybackEnricher +from .wayback_enricher import WaybackArchiverEnricher from .hash_enricher import HashEnricher -from .thumbnail_enricher import ThumbnailEnricher \ No newline at end of file +from .thumbnail_enricher import ThumbnailEnricher +from .wacz_enricher import WaczEnricher \ No newline at end of file diff --git a/src/enrichers/wacz_enricher.py b/src/enrichers/wacz_enricher.py new file mode 100644 index 0000000..1fa3191 --- /dev/null +++ b/src/enrichers/wacz_enricher.py @@ -0,0 +1,70 @@ +import os +import shutil +import subprocess +import uuid +from archivers.archiver import Archiverv2 +from media import Media +from . import Enricher +from metadata import Metadata +from loguru import logger +import time, requests + + +class WaczEnricher(Enricher): + """ + Submits the current URL to the webarchive and returns a job_id or completed archive + """ + name = "wacz_enricher" + + def __init__(self, config: dict) -> None: + # without this STEP.__init__ is not called + super().__init__(config) + + @staticmethod + def configs() -> dict: + return { + "profile": {"default": None, "help": "browsertrix-profile (for profile generation see https://github.com/webrecorder/browsertrix-crawler#creating-and-using-browser-profiles)."}, + "timeout": {"default": 90, "help": "timeout for WACZ generation in seconds"}, + } + + def enrich(self, to_enrich: Metadata) -> bool: + # TODO: figure out support for browsertrix in docker + url = to_enrich.get_url() + logger.debug(f"generating WACZ for {url=}") + collection = str(uuid.uuid4())[0:8] + browsertrix_home = os.path.abspath(to_enrich.get_tmp_dir()) + cmd = [ + "docker", "run", + "--rm", # delete container once it has completed running + "-v", f"{browsertrix_home}:/crawls/", + # "-it", # this leads to "the input device is not a TTY" + "webrecorder/browsertrix-crawler", "crawl", + "--url", url, + "--scopeType", "page", + "--generateWACZ", + "--text", + "--collection", collection, + "--behaviors", "autoscroll,autoplay,autofetch,siteSpecific", + "--behaviorTimeout", str(self.timeout), + "--timeout", str(self.timeout) + ] + if self.profile: + profile_fn = os.path.join(browsertrix_home, "profile.tar.gz") + shutil.copyfile(self.profile, profile_fn) + # TODO: test which is right + cmd.extend(["--profile", profile_fn]) + # cmd.extend(["--profile", "/crawls/profile.tar.gz"]) + + try: + logger.info(f"Running browsertrix-crawler: {' '.join(cmd)}") + subprocess.run(cmd, check=True) + except Exception as e: + logger.error(f"WACZ generation failed: {e}") + return False + + filename = os.path.join(browsertrix_home, "collections", collection, f"{collection}.wacz") + if not os.path.exists(filename): + logger.warning(f"Unable to locate and upload WACZ {filename=}") + return False + + to_enrich.add_media(Media(filename), "browsertrix") diff --git a/src/enrichers/wayback_enricher.py b/src/enrichers/wayback_enricher.py index 429f218..db53a08 100644 --- a/src/enrichers/wayback_enricher.py +++ b/src/enrichers/wayback_enricher.py @@ -1,16 +1,15 @@ -from utils import Webdriver +from archivers.archiver import Archiverv2 from . import Enricher from metadata import Metadata from loguru import logger -from selenium.common.exceptions import TimeoutException import time, requests -class WaybackEnricher(Enricher): +class WaybackArchiverEnricher(Enricher, Archiverv2): """ Submits the current URL to the webarchive and returns a job_id or completed archive """ - name = "wayback_enricher" + name = "wayback_archiver_enricher" def __init__(self, config: dict) -> None: # without this STEP.__init__ is not called @@ -26,9 +25,19 @@ class WaybackEnricher(Enricher): "secret": {"default": None, "help": "wayback API secret. to get credentials visit https://archive.org/account/s3.php"} } - def enrich(self, to_enrich: Metadata) -> None: + def download(self, item: Metadata) -> Metadata: + result = Metadata() + result.merge(item) + if self.enrich(result): + return result.success("wayback") + + def enrich(self, to_enrich: Metadata) -> bool: url = to_enrich.get_url() - logger.debug(f"Enriching wayback for {url=}") + logger.debug(f"calling wayback for {url=}") + + if to_enrich.get("wayback"): + logger.info(f"Wayback enricher had already been executed: {to_enrich.get('wayback')}") + return True ia_headers = { "Accept": "application/json", @@ -39,10 +48,13 @@ class WaybackEnricher(Enricher): if r.status_code != 200: logger.error(em := f"Internet archive failed with status of {r.status_code}: {r.json()}") to_enrich.set("wayback", em) - return + return False # check job status - job_id = r.json()['job_id'] + job_id = r.json().get('job_id') + if not job_id: + logger.error(f"Wayback failed with {r.json()}") + return False # waits at most timeout seconds until job is completed, otherwise only enriches the job_id information start_time = time.time() @@ -50,12 +62,15 @@ class WaybackEnricher(Enricher): attempt = 1 while not wayback_url and time.time() - start_time <= self.timeout: try: - logger.debug(f"GETting status for {job_id=} on {url=} ({attempt=})") r_status = requests.get(f'https://web.archive.org/save/status/{job_id}', headers=ia_headers) r_json = r_status.json() if r_status.status_code == 200 and r_json['status'] == 'success': wayback_url = f"https://web.archive.org/web/{r_json['timestamp']}/{r_json['original_url']}" + elif r_status.status_code != 200 or r_json['status'] != 'pending': + logger.error(f"Wayback failed with {r_json}") + return False + except Exception as e: logger.warning(f"error fetching status for {url=} due to: {e}") if not wayback_url: @@ -66,4 +81,5 @@ class WaybackEnricher(Enricher): to_enrich.set("wayback", wayback_url) else: to_enrich.set("wayback", {"job_id": job_id, "check_status": f'https://web.archive.org/save/status/{job_id}'}) - to_enrich.set("wayback lookup", f"https://web.archive.org/web/*/{url}") + to_enrich.set("check wayback", f"https://web.archive.org/web/*/{url}") + return True diff --git a/src/formatters/templates/html_template.html b/src/formatters/templates/html_template.html index 9c3b54e..4855441 100644 --- a/src/formatters/templates/html_template.html +++ b/src/formatters/templates/html_template.html @@ -162,7 +162,6 @@ {% endfor %} -

Made with bellingcat/auto-archiver