mirror of
https://github.com/bellingcat/auto-archiver.git
synced 2026-06-13 05:38:29 +03:00
Save to folders for S3 and GD. Google Drive (GD) storage
This commit is contained in:
@@ -14,6 +14,9 @@ from selenium.common.exceptions import TimeoutException
|
||||
from storages import Storage
|
||||
from utils import mkdir_if_not_exists
|
||||
|
||||
from selenium.webdriver.common.by import By
|
||||
from loguru import logger
|
||||
from selenium.common.exceptions import TimeoutException
|
||||
|
||||
@dataclass
|
||||
class ArchiveResult:
|
||||
@@ -39,7 +42,7 @@ class Archiver(ABC):
|
||||
return self.__class__.__name__
|
||||
|
||||
@abstractmethod
|
||||
def download(self, url, check_if_exists=False): pass
|
||||
def download(self, url, check_if_exists=False, filenumber=None): pass
|
||||
|
||||
def get_netloc(self, url):
|
||||
return urlparse(url).netloc
|
||||
@@ -47,7 +50,8 @@ class Archiver(ABC):
|
||||
def get_html_key(self, url):
|
||||
return self.get_key(urlparse(url).path.replace("/", "_") + ".html")
|
||||
|
||||
def generate_media_page_html(self, url, urls_info: dict, object, thumbnail=None):
|
||||
# generate the html page eg SM3013/twitter__minmyatnaing13_status_1499415562937503751.html
|
||||
def generate_media_page_html(self, url, urls_info: dict, object, thumbnail=None, filenumber=None):
|
||||
page = f'''<html><head><title>{url}</title><meta charset="UTF-8"></head>
|
||||
<body>
|
||||
<h2>Archived media from {self.name}</h2>
|
||||
@@ -61,18 +65,24 @@ class Archiver(ABC):
|
||||
|
||||
page_key = self.get_key(urlparse(url).path.replace("/", "_") + ".html")
|
||||
page_filename = 'tmp/' + page_key
|
||||
page_cdn = self.storage.get_cdn_url(page_key)
|
||||
|
||||
with open(page_filename, "w") as f:
|
||||
f.write(page)
|
||||
|
||||
page_hash = self.get_hash(page_filename)
|
||||
|
||||
if filenumber != None:
|
||||
logger.trace(f'filenumber for directory is {filenumber}')
|
||||
page_key = filenumber + "/" + page_key
|
||||
|
||||
self.storage.upload(page_filename, page_key, extra_args={
|
||||
'ACL': 'public-read', 'ContentType': 'text/html'})
|
||||
|
||||
page_cdn = self.storage.get_cdn_url(page_key)
|
||||
return (page_cdn, page_hash, thumbnail)
|
||||
|
||||
def generate_media_page(self, urls, url, object):
|
||||
# eg images in a tweet save to cloud storage
|
||||
def generate_media_page(self, urls, url, object, filenumber=None):
|
||||
headers = {
|
||||
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'
|
||||
}
|
||||
@@ -87,19 +97,30 @@ class Archiver(ABC):
|
||||
|
||||
filename = 'tmp/' + key
|
||||
|
||||
# eg media_url: https://pbs.twimg.com/media/FM7-ggCUYAQHKWW?format=jpg&name=orig
|
||||
d = requests.get(media_url, headers=headers)
|
||||
with open(filename, 'wb') as f:
|
||||
f.write(d.content)
|
||||
|
||||
if filenumber is not None:
|
||||
logger.debug(f'filenumber for directory is {filenumber}')
|
||||
key = filenumber + "/" + key
|
||||
|
||||
# eg filename: 'tmp/twitter__media_FM7-ggCUYAQHKWW.jpg'
|
||||
# eg key: 'twitter__media_FM7-ggCUYAQHKWW.jpg'
|
||||
# or if using filename key: 'SM3013/twitter__media_FM7-ggCUYAQHKWW.jpg'
|
||||
self.storage.upload(filename, key)
|
||||
|
||||
hash = self.get_hash(filename)
|
||||
|
||||
# eg 'https://testhashing.fra1.cdn.digitaloceanspaces.com/Test_Hashing/Sheet1/twitter__media_FM7-ggCUYAQHKWW.jpg'
|
||||
cdn_url = self.storage.get_cdn_url(key)
|
||||
|
||||
if thumbnail is None:
|
||||
thumbnail = cdn_url
|
||||
uploaded_media.append({'cdn_url': cdn_url, 'key': key, 'hash': hash})
|
||||
|
||||
return self.generate_media_page_html(url, uploaded_media, object, thumbnail=thumbnail)
|
||||
return self.generate_media_page_html(url, uploaded_media, object, thumbnail=thumbnail, filenumber=filenumber)
|
||||
|
||||
def get_key(self, filename):
|
||||
"""
|
||||
@@ -119,15 +140,33 @@ class Archiver(ABC):
|
||||
def get_hash(self, filename):
    """Return the SHA-256 hex digest of the file at ``filename``.

    The file is opened in binary mode and hashed in fixed-size chunks so
    that large media files do not have to be held in memory all at once.

    :param filename: path of the file to hash
    :return: lowercase hexadecimal SHA-256 digest string
    :raises OSError: if the file cannot be opened or read
    """
    # option to use SHA3_512 instead: replace with hashlib.sha3_512()
    digest = hashlib.sha256()

    # the context manager closes the handle even if a read fails part-way
    with open(filename, "rb") as f:
        # iter() with a sentinel stops at the empty bytes returned on EOF
        for chunk in iter(lambda: f.read(1024 * 1024), b""):
            digest.update(chunk)

    return digest.hexdigest()
|
||||
|
||||
def get_screenshot(self, url):
|
||||
# eg SA3013/twitter__minmyatnaing13_status_14994155629375037512022-04-27T13:51:43.701962.png
|
||||
# def get_screenshot(self, url, filenumber, storage="GD"):
|
||||
def get_screenshot(self, url, filenumber):
|
||||
key = self.get_key(urlparse(url).path.replace(
|
||||
"/", "_") + datetime.datetime.utcnow().isoformat().replace(" ", "_") + ".png")
|
||||
filename = 'tmp/' + key
|
||||
|
||||
# Dismiss the accept-cookies popup before taking the screenshot (for ytdlp video)
|
||||
if 'facebook.com' in url:
|
||||
try:
|
||||
logger.debug(f'Trying fb click accept cookie popup for {url}')
|
||||
self.driver.get("http://www.facebook.com")
|
||||
foo = self.driver.find_element(By.XPATH,"//button[@data-cookiebanner='accept_only_essential_button']")
|
||||
foo.click()
|
||||
logger.debug(f'fb click worked')
|
||||
# the Linux server needs a sleep here, otherwise the Facebook cookie won't have taken effect and we'll get a popup on the next page
|
||||
time.sleep(2)
|
||||
except:
|
||||
logger.warning(f'Failed on fb accept cookies for url {url}')
|
||||
|
||||
try:
|
||||
self.driver.get(url)
|
||||
time.sleep(6)
|
||||
@@ -135,8 +174,14 @@ class Archiver(ABC):
|
||||
logger.info("TimeoutException loading page for screenshot")
|
||||
|
||||
self.driver.save_screenshot(filename)
|
||||
|
||||
if filenumber is not None:
|
||||
logger.debug(f'filenumber for directory is {filenumber}')
|
||||
key = filenumber + "/" + key
|
||||
|
||||
self.storage.upload(filename, key, extra_args={
|
||||
'ACL': 'public-read', 'ContentType': 'image/png'})
|
||||
|
||||
return self.storage.get_cdn_url(key)
|
||||
|
||||
def get_thumbnails(self, filename, key, duration=None):
|
||||
@@ -167,10 +212,9 @@ class Archiver(ABC):
|
||||
thumbnail_filename = thumbnails_folder + fname
|
||||
key = key_folder + fname
|
||||
|
||||
cdn_url = self.storage.get_cdn_url(key)
|
||||
|
||||
self.storage.upload(thumbnail_filename, key)
|
||||
|
||||
cdn_url = self.storage.get_cdn_url(key)
|
||||
cdn_urls.append(cdn_url)
|
||||
|
||||
if len(cdn_urls) == 0:
|
||||
|
||||
@@ -11,7 +11,7 @@ from .base_archiver import Archiver, ArchiveResult
|
||||
class TelegramArchiver(Archiver):
|
||||
name = "telegram"
|
||||
|
||||
def download(self, url, check_if_exists=False):
|
||||
def download(self, url, check_if_exists=False, filenumber=None):
|
||||
# detect URLs that we definitely cannot handle
|
||||
if 't.me' != self.get_netloc(url):
|
||||
return False
|
||||
@@ -27,7 +27,7 @@ class TelegramArchiver(Archiver):
|
||||
if url[-8:] != "?embed=1":
|
||||
url += "?embed=1"
|
||||
|
||||
screenshot = self.get_screenshot(url)
|
||||
screenshot = self.get_screenshot(url, filenumber=filenumber)
|
||||
|
||||
t = requests.get(url, headers=headers)
|
||||
s = BeautifulSoup(t.content, 'html.parser')
|
||||
@@ -42,7 +42,7 @@ class TelegramArchiver(Archiver):
|
||||
urls = [u.replace("'", "") for u in re.findall(r'url\((.*?)\)', im['style'])]
|
||||
images += urls
|
||||
|
||||
page_cdn, page_hash, thumbnail = self.generate_media_page(images, url, html.escape(str(t.content)))
|
||||
page_cdn, page_hash, thumbnail = self.generate_media_page(images, url, html.escape(str(t.content)),filenumber=filenumber)
|
||||
time_elements = s.find_all('time')
|
||||
timestamp = time_elements[0].get('datetime') if len(time_elements) else None
|
||||
|
||||
@@ -52,6 +52,9 @@ class TelegramArchiver(Archiver):
|
||||
video_id = video_url.split('/')[-1].split('?')[0]
|
||||
key = self.get_key(video_id)
|
||||
|
||||
if filenumber is not None:
|
||||
key = filenumber + "/" + key
|
||||
|
||||
filename = 'tmp/' + key
|
||||
cdn_url = self.storage.get_cdn_url(key)
|
||||
|
||||
|
||||
@@ -41,20 +41,22 @@ class TelethonArchiver(Archiver):
|
||||
media.append(post)
|
||||
return media
|
||||
|
||||
def download(self, url, check_if_exists=False):
|
||||
def download(self, url, check_if_exists=False, filenumber=None):
|
||||
# detect URLs that we definitely cannot handle
|
||||
matches = self.link_pattern.findall(url)
|
||||
if not len(matches):
|
||||
return False
|
||||
|
||||
status = "success"
|
||||
screenshot = self.get_screenshot(url)
|
||||
screenshot = self.get_screenshot(url, filenumber)
|
||||
|
||||
# app will ask (stall for user input!) for phone number and auth code if anon.session not found
|
||||
with self.client.start():
|
||||
matches = list(matches[0])
|
||||
chat, post_id = matches[1], matches[2]
|
||||
|
||||
post_id = int(post_id)
|
||||
|
||||
try:
|
||||
post = self.client.get_messages(chat, ids=post_id)
|
||||
except ValueError as e:
|
||||
@@ -65,9 +67,13 @@ class TelethonArchiver(Archiver):
|
||||
|
||||
if len(media_posts) > 1:
|
||||
key = self.get_html_key(url)
|
||||
cdn_url = self.storage.get_cdn_url(key)
|
||||
|
||||
if filenumber is not None:
|
||||
key = filenumber + "/" + key
|
||||
|
||||
if check_if_exists and self.storage.exists(key):
|
||||
# only S3 storage supports storage.exists; it is not implemented for Google Drive (GD)
|
||||
cdn_url = self.storage.get_cdn_url(key)
|
||||
status = 'already archived'
|
||||
return ArchiveResult(status='already archived', cdn_url=cdn_url, title=post.message, timestamp=post.date, screenshot=screenshot)
|
||||
|
||||
@@ -78,19 +84,26 @@ class TelethonArchiver(Archiver):
|
||||
if len(mp.message) > len(message): message = mp.message
|
||||
filename = self.client.download_media(mp.media, f'tmp/{chat}_{group_id}/{mp.id}')
|
||||
key = filename.split('tmp/')[1]
|
||||
|
||||
if filenumber is not None:
|
||||
key = filenumber + "/" + key
|
||||
self.storage.upload(filename, key)
|
||||
hash = self.get_hash(filename)
|
||||
cdn_url = self.storage.get_cdn_url(key)
|
||||
uploaded_media.append({'cdn_url': cdn_url, 'key': key, 'hash': hash})
|
||||
os.remove(filename)
|
||||
|
||||
page_cdn, page_hash, _ = self.generate_media_page_html(url, uploaded_media, html.escape(str(post)))
|
||||
page_cdn, page_hash, _ = self.generate_media_page_html(url, uploaded_media, html.escape(str(post)), filenumber=filenumber)
|
||||
|
||||
return ArchiveResult(status=status, cdn_url=page_cdn, title=post.message, timestamp=post.date, hash=page_hash, screenshot=screenshot)
|
||||
elif len(media_posts) == 1:
|
||||
key = self.get_key(f'{chat}_{post_id}')
|
||||
filename = self.client.download_media(post.media, f'tmp/{key}')
|
||||
key = filename.split('tmp/')[1].replace(" ", "")
|
||||
|
||||
if filenumber is not None:
|
||||
key = filenumber + "/" + key
|
||||
|
||||
self.storage.upload(filename, key)
|
||||
hash = self.get_hash(filename)
|
||||
cdn_url = self.storage.get_cdn_url(key)
|
||||
@@ -99,5 +112,5 @@ class TelethonArchiver(Archiver):
|
||||
|
||||
return ArchiveResult(status=status, cdn_url=cdn_url, title=post.message, thumbnail=key_thumb, thumbnail_index=thumb_index, timestamp=post.date, hash=hash, screenshot=screenshot)
|
||||
|
||||
page_cdn, page_hash, _ = self.generate_media_page_html(url, [], html.escape(str(post)))
|
||||
page_cdn, page_hash, _ = self.generate_media_page_html(url, [], html.escape(str(post)), filenumber=filenumber)
|
||||
return ArchiveResult(status=status, cdn_url=page_cdn, title=post.message, timestamp=post.date, hash=page_hash, screenshot=screenshot)
|
||||
|
||||
@@ -8,7 +8,7 @@ from .base_archiver import Archiver, ArchiveResult
|
||||
class TiktokArchiver(Archiver):
|
||||
name = "tiktok"
|
||||
|
||||
def download(self, url, check_if_exists=False):
|
||||
def download(self, url, check_if_exists=False, filenumber=None):
|
||||
if 'tiktok.com' not in url:
|
||||
return False
|
||||
|
||||
@@ -54,11 +54,13 @@ class TiktokArchiver(Archiver):
|
||||
thumbnail_index=thumb_index, duration=info.duration, title=info.caption, timestamp=info.create.isoformat(),
|
||||
hash=hash, screenshot=screenshot)
|
||||
|
||||
except tiktok_downloader.Except.InvalidUrl:
|
||||
except tiktok_downloader.Except.InvalidUrl as e:
|
||||
status = 'Invalid URL'
|
||||
logger.warning(f'Invalid URL on {url} {e}\n{traceback.format_exc()}')
|
||||
return ArchiveResult(status=status)
|
||||
|
||||
except:
|
||||
error = traceback.format_exc()
|
||||
status = 'Other Tiktok error: ' + str(error)
|
||||
logger.warning(f'Other Tiktok error' + str(error))
|
||||
return ArchiveResult(status=status)
|
||||
|
||||
@@ -1,6 +1,5 @@
|
||||
from snscrape.modules.twitter import TwitterTweetScraper, Video, Gif, Photo
|
||||
from loguru import logger
|
||||
import requests
|
||||
from urllib.parse import urlparse
|
||||
|
||||
from .base_archiver import Archiver, ArchiveResult
|
||||
@@ -9,7 +8,8 @@ from .base_archiver import Archiver, ArchiveResult
|
||||
class TwitterArchiver(Archiver):
|
||||
name = "twitter"
|
||||
|
||||
def download(self, url, check_if_exists=False):
|
||||
def download(self, url, check_if_exists=False, filenumber=None):
|
||||
|
||||
if 'twitter.com' != self.get_netloc(url):
|
||||
return False
|
||||
|
||||
@@ -24,11 +24,14 @@ class TwitterArchiver(Archiver):
|
||||
|
||||
try:
|
||||
tweet = next(scr.get_items())
|
||||
except:
|
||||
logger.warning('wah wah')
|
||||
except Exception as ex:
|
||||
template = "TwitterArchiver cant get tweet and threw, which can happen if a media sensitive tweet. \n type: {0} occurred. \n arguments:{1!r}"
|
||||
message = template.format(type(ex).__name__, ex.args)
|
||||
logger.warning(message)
|
||||
return False
|
||||
|
||||
if tweet.media is None:
|
||||
logger.trace(f'No media found')
|
||||
return False
|
||||
|
||||
urls = []
|
||||
@@ -45,8 +48,8 @@ class TwitterArchiver(Archiver):
|
||||
else:
|
||||
logger.warning(f"Could not get media URL of {media}")
|
||||
|
||||
page_cdn, page_hash, thumbnail = self.generate_media_page(urls, url, tweet.json())
|
||||
page_cdn, page_hash, thumbnail = self.generate_media_page(urls, url, tweet.json(), filenumber)
|
||||
|
||||
screenshot = self.get_screenshot(url)
|
||||
screenshot = self.get_screenshot(url, filenumber)
|
||||
|
||||
return ArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=tweet.date)
|
||||
|
||||
@@ -4,6 +4,8 @@ from bs4 import BeautifulSoup
|
||||
from storages import Storage
|
||||
from .base_archiver import Archiver, ArchiveResult
|
||||
|
||||
from loguru import logger
|
||||
|
||||
|
||||
class WaybackArchiver(Archiver):
|
||||
name = "wayback"
|
||||
@@ -12,7 +14,7 @@ class WaybackArchiver(Archiver):
|
||||
super(WaybackArchiver, self).__init__(storage, driver)
|
||||
self.seen_urls = {}
|
||||
|
||||
def download(self, url, check_if_exists=False):
|
||||
def download(self, url, check_if_exists=False, filenumber=None):
|
||||
if check_if_exists and url in self.seen_urls:
|
||||
return self.seen_urls[url]
|
||||
|
||||
@@ -25,9 +27,11 @@ class WaybackArchiver(Archiver):
|
||||
'https://web.archive.org/save/', headers=ia_headers, data={'url': url})
|
||||
|
||||
if r.status_code != 200:
|
||||
logger.warning(f"Internet archive failed with status of {r.status_code}")
|
||||
return ArchiveResult(status="Internet archive failed")
|
||||
|
||||
if 'job_id' not in r.json() and 'message' in r.json():
|
||||
logger.warning(f"Internet archive failed json \n {r.json()}")
|
||||
return ArchiveResult(status=f"Internet archive failed: {r.json()['message']}")
|
||||
|
||||
job_id = r.json()['job_id']
|
||||
@@ -71,7 +75,7 @@ class WaybackArchiver(Archiver):
|
||||
except:
|
||||
title = "Could not get title"
|
||||
|
||||
screenshot = self.get_screenshot(url)
|
||||
screenshot = self.get_screenshot(url, filenumber)
|
||||
result = ArchiveResult(status='Internet Archive fallback', cdn_url=archive_url, title=title, screenshot=screenshot)
|
||||
self.seen_urls[url] = result
|
||||
return result
|
||||
|
||||
@@ -15,7 +15,7 @@ class YoutubeDLArchiver(Archiver):
|
||||
super().__init__(storage, driver)
|
||||
self.fb_cookie = fb_cookie
|
||||
|
||||
def download(self, url, check_if_exists=False):
|
||||
def download(self, url, check_if_exists=False, filenumber=None):
|
||||
netloc = self.get_netloc(url)
|
||||
if netloc in ['facebook.com', 'www.facebook.com']:
|
||||
logger.debug('Using Facebook cookie')
|
||||
@@ -27,13 +27,17 @@ class YoutubeDLArchiver(Archiver):
|
||||
|
||||
try:
|
||||
info = ydl.extract_info(url, download=False)
|
||||
except yt_dlp.utils.DownloadError:
|
||||
# no video here
|
||||
except yt_dlp.utils.DownloadError as e:
|
||||
logger.debug(f'No video - Youtube normal control flow: {e}')
|
||||
return False
|
||||
except Exception as e:
|
||||
logger.debug(f'ytdlp exception which is normal for example a facebook page with images only will cause a IndexError: list index out of range. Exception here is: \n {e}')
|
||||
return False
|
||||
|
||||
if info.get('is_live', False):
|
||||
logger.warning("Live streaming media, not archiving now")
|
||||
return ArchiveResult(status="Streaming media")
|
||||
|
||||
if 'twitter.com' in netloc:
|
||||
if 'https://twitter.com/' in info['webpage_url']:
|
||||
logger.info('Found https://twitter.com/ in the download url from Twitter')
|
||||
@@ -41,7 +45,6 @@ class YoutubeDLArchiver(Archiver):
|
||||
logger.info('Found a linked video probably in a link in a tweet - not getting that video as there may be images in the tweet')
|
||||
return False
|
||||
|
||||
|
||||
if check_if_exists:
|
||||
if 'entries' in info:
|
||||
if len(info['entries']) > 1:
|
||||
@@ -58,6 +61,9 @@ class YoutubeDLArchiver(Archiver):
|
||||
|
||||
key = self.get_key(filename)
|
||||
|
||||
if filenumber is not None:
|
||||
key = filenumber + "/" + key
|
||||
|
||||
if self.storage.exists(key):
|
||||
status = 'already archived'
|
||||
cdn_url = self.storage.get_cdn_url(key)
|
||||
@@ -81,12 +87,19 @@ class YoutubeDLArchiver(Archiver):
|
||||
|
||||
if status != 'already archived':
|
||||
key = self.get_key(filename)
|
||||
cdn_url = self.storage.get_cdn_url(key)
|
||||
|
||||
if filenumber is not None:
|
||||
key = filenumber + "/" + key
|
||||
|
||||
self.storage.upload(filename, key)
|
||||
|
||||
# filename ='tmp/sDE-qZdi8p8.webm'
|
||||
# key ='SM0022/youtube_dl_sDE-qZdi8p8.webm'
|
||||
cdn_url = self.storage.get_cdn_url(key)
|
||||
|
||||
hash = self.get_hash(filename)
|
||||
screenshot = self.get_screenshot(url)
|
||||
screenshot = self.get_screenshot(url, filenumber)
|
||||
|
||||
|
||||
# get duration
|
||||
duration = info.get('duration')
|
||||
|
||||
Reference in New Issue
Block a user