mirror of
https://github.com/bellingcat/auto-archiver.git
synced 2026-06-08 03:18:28 +03:00
Merge branch 'dev' into oauth
This commit is contained in:
@@ -26,15 +26,14 @@ class ArchiveResult:
|
||||
screenshot: str = None
|
||||
hash: str = None
|
||||
|
||||
|
||||
class Archiver(ABC):
|
||||
HASH_ALGORITHM="SHA-256" # can be overwritten by user configs
|
||||
name = "default"
|
||||
retry_regex = r"retrying at (\d+)$"
|
||||
|
||||
def __init__(self, storage: Storage, driver, hash_algorithm):
|
||||
def __init__(self, storage: Storage, driver):
|
||||
self.storage = storage
|
||||
self.driver = driver
|
||||
self.hash_algorithm = hash_algorithm
|
||||
|
||||
def __str__(self):
|
||||
return self.__class__.__name__
|
||||
@@ -48,7 +47,6 @@ class Archiver(ABC):
|
||||
def get_netloc(self, url):
|
||||
return urlparse(url).netloc
|
||||
|
||||
# generate the HTML page, e.g. SM3013/twitter__minmyatnaing13_status_1499415562937503751.html
|
||||
def generate_media_page_html(self, url, urls_info: dict, object, thumbnail=None):
|
||||
"""
|
||||
Generates an index.html page where each @urls_info is displayed
|
||||
@@ -164,12 +162,11 @@ class Archiver(ABC):
|
||||
def get_hash(self, filename):
|
||||
with open(filename, "rb") as f:
|
||||
bytes = f.read() # read entire file as bytes
|
||||
ha = self.hash_algorithm
|
||||
logger.debug(f'Hash algorithm is {ha}')
|
||||
logger.debug(f'Hash algorithm is {self.HASH_ALGORITHM}')
|
||||
|
||||
if ha == "SHA3_512": hash = hashlib.sha3_512(bytes)
|
||||
elif ha == "SHA256": hash = hashlib.sha256(bytes)
|
||||
else: raise Exception("Unknown Hash Algorithm of {ha}")
|
||||
if self.HASH_ALGORITHM == "SHA-256": hash = hashlib.sha256(bytes)
|
||||
elif self.HASH_ALGORITHM == "SHA3-512": hash = hashlib.sha3_512(bytes)
|
||||
else: raise Exception(f"Unknown Hash Algorithm of {self.HASH_ALGORITHM}")
|
||||
|
||||
return hash.hexdigest()
|
||||
|
||||
|
||||
@@ -11,9 +11,6 @@ from storages import Storage
|
||||
class TelegramArchiver(Archiver):
|
||||
name = "telegram"
|
||||
|
||||
def __init__(self, storage: Storage, driver, hash_algorithm):
|
||||
super().__init__(storage, driver, hash_algorithm)
|
||||
|
||||
def download(self, url, check_if_exists=False):
|
||||
# detect URLs that we definitely cannot handle
|
||||
if 't.me' != self.get_netloc(url):
|
||||
|
||||
@@ -15,8 +15,8 @@ class TelethonArchiver(Archiver):
|
||||
name = "telethon"
|
||||
link_pattern = re.compile(r"https:\/\/t\.me(\/c){0,1}\/(.+)\/(\d+)")
|
||||
|
||||
def __init__(self, storage: Storage, driver, config: TelethonConfig, hash_algorithm):
|
||||
super().__init__(storage, driver, hash_algorithm)
|
||||
def __init__(self, storage: Storage, driver, config: TelethonConfig):
|
||||
super().__init__(storage, driver)
|
||||
if config:
|
||||
self.client = TelegramClient("./anon", config.api_id, config.api_hash)
|
||||
self.bot_token = config.bot_token
|
||||
|
||||
@@ -15,9 +15,6 @@ class TiktokArchiver(Archiver):
|
||||
|
||||
status = 'success'
|
||||
|
||||
def __init__(self, storage: Storage, driver, hash_algorithm):
|
||||
super().__init__(storage, driver, hash_algorithm)
|
||||
|
||||
try:
|
||||
info = tiktok_downloader.info_post(url)
|
||||
key = self.get_key(f'{info.id}.mp4')
|
||||
|
||||
@@ -13,8 +13,8 @@ from .twitter_archiver import TwitterArchiver
|
||||
class TwitterApiArchiver(TwitterArchiver):
|
||||
name = "twitter_api"
|
||||
|
||||
def __init__(self, storage: Storage, driver, config: TwitterApiConfig, hash_algorithm):
|
||||
super().__init__(storage, driver, hash_algorithm)
|
||||
def __init__(self, storage: Storage, driver, config: TwitterApiConfig):
|
||||
super().__init__(storage, driver)
|
||||
|
||||
if config.bearer_token:
|
||||
self.api = Api(bearer_token=config.bearer_token)
|
||||
|
||||
@@ -5,15 +5,11 @@ from snscrape.modules.twitter import TwitterTweetScraper, Video, Gif, Photo
|
||||
|
||||
from .base_archiver import Archiver, ArchiveResult
|
||||
|
||||
from storages import Storage
|
||||
|
||||
class TwitterArchiver(Archiver):
|
||||
"""
|
||||
This Twitter Archiver uses unofficial scraping methods, and it works as
|
||||
an alternative to TwitterApiArchiver when no API credentials are provided.
|
||||
"""
|
||||
def __init__(self, storage: Storage, driver, hash_algorithm):
|
||||
super().__init__(storage, driver, hash_algorithm)
|
||||
|
||||
name = "twitter"
|
||||
link_pattern = re.compile(r"twitter.com\/(?:\#!\/)?(\w+)\/status(?:es)?\/(\d+)")
|
||||
|
||||
@@ -17,8 +17,8 @@ class VkArchiver(Archiver):
|
||||
wall_pattern = re.compile(r"(wall.{0,1}\d+_\d+)")
|
||||
photo_pattern = re.compile(r"(photo.{0,1}\d+_\d+)")
|
||||
|
||||
def __init__(self, storage: Storage, driver, config: VkConfig, hash_algorithm):
|
||||
super().__init__(storage, driver, hash_algorithm)
|
||||
def __init__(self, storage: Storage, driver, config: VkConfig):
|
||||
super().__init__(storage, driver)
|
||||
if config != None:
|
||||
self.vks = VkScraper(config.username, config.password)
|
||||
|
||||
|
||||
@@ -15,8 +15,8 @@ class WaybackArchiver(Archiver):
|
||||
"""
|
||||
name = "wayback"
|
||||
|
||||
def __init__(self, storage: Storage, driver, config: WaybackConfig, hash_algorithm):
|
||||
super(WaybackArchiver, self).__init__(storage, driver, hash_algorithm)
|
||||
def __init__(self, storage: Storage, driver, config: WaybackConfig):
|
||||
super(WaybackArchiver, self).__init__(storage, driver)
|
||||
self.config = config
|
||||
self.seen_urls = {}
|
||||
|
||||
|
||||
@@ -12,8 +12,8 @@ class YoutubeDLArchiver(Archiver):
|
||||
name = "youtube_dl"
|
||||
ydl_opts = {'outtmpl': f'{Storage.TMP_FOLDER}%(id)s.%(ext)s', 'quiet': False}
|
||||
|
||||
def __init__(self, storage: Storage, driver, fb_cookie, hash_algorithm):
|
||||
super().__init__(storage, driver, hash_algorithm)
|
||||
def __init__(self, storage: Storage, driver, fb_cookie):
|
||||
super().__init__(storage, driver)
|
||||
self.fb_cookie = fb_cookie
|
||||
|
||||
def download(self, url, check_if_exists=False):
|
||||
|
||||
@@ -104,14 +104,14 @@ def process_sheet(c: Config):
|
||||
|
||||
# order matters, first to succeed excludes remaining
|
||||
active_archivers = [
|
||||
TelethonArchiver(storage, c.webdriver, c.telegram_config, c.hash_algorithm),
|
||||
TiktokArchiver(storage, c.webdriver, c.hash_algorithm),
|
||||
TwitterApiArchiver(storage, c.webdriver, c.twitter_config, c.hash_algorithm),
|
||||
YoutubeDLArchiver(storage, c.webdriver, c.facebook_cookie,c.hash_algorithm),
|
||||
TelegramArchiver(storage, c.webdriver, c.hash_algorithm),
|
||||
TwitterArchiver(storage, c.webdriver, c.hash_algorithm),
|
||||
VkArchiver(storage, c.webdriver, c.vk_config, c.hash_algorithm),
|
||||
WaybackArchiver(storage, c.webdriver, c.wayback_config, c.hash_algorithm)
|
||||
TelethonArchiver(storage, c.webdriver, c.telegram_config),
|
||||
TiktokArchiver(storage, c.webdriver),
|
||||
TwitterApiArchiver(storage, c.webdriver, c.twitter_config),
|
||||
YoutubeDLArchiver(storage, c.webdriver, c.facebook_cookie),
|
||||
TelegramArchiver(storage, c.webdriver),
|
||||
TwitterArchiver(storage, c.webdriver),
|
||||
VkArchiver(storage, c.webdriver, c.vk_config),
|
||||
WaybackArchiver(storage, c.webdriver, c.wayback_config)
|
||||
]
|
||||
|
||||
for archiver in active_archivers:
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
|
||||
import argparse, yaml, json
|
||||
from archivers.base_archiver import Archiver
|
||||
import gspread
|
||||
from loguru import logger
|
||||
from selenium import webdriver
|
||||
@@ -81,7 +82,7 @@ class Config:
|
||||
)
|
||||
self.webdriver = "not initialized"
|
||||
|
||||
self.hash_algorithm = execution.get("hash_algorithm")
|
||||
Archiver.HASH_ALGORITHM = execution.get("hash_algorithm", Archiver.HASH_ALGORITHM)
|
||||
|
||||
# ---------------------- SECRETS - APIs and service configurations
|
||||
secrets = self.config.get("secrets", {})
|
||||
@@ -262,6 +263,7 @@ class Config:
|
||||
"storage": self.storage,
|
||||
"header": self.header,
|
||||
"check_if_exists": self.check_if_exists,
|
||||
"hash_algorithm": Archiver.HASH_ALGORITHM,
|
||||
"save_logs": self.save_logs,
|
||||
"selenium_config": asdict(self.selenium_config),
|
||||
"selenium_webdriver": self.webdriver != None,
|
||||
|
||||
@@ -91,6 +91,10 @@ execution:
|
||||
storage: s3
|
||||
# defaults to false, when true will try to avoid duplicate URL archives
|
||||
check_if_exists: true
|
||||
|
||||
# choose a hash algorithm (either SHA-256 or SHA3-512, defaults to SHA-256)
|
||||
# hash_algorithm: SHA-256
|
||||
|
||||
# optional configurations for the selenium browser that takes screenshots, these are the defaults
|
||||
selenium:
|
||||
# values under 10s might cause the screenshot capture to fail
|
||||
@@ -116,7 +120,3 @@ execution:
|
||||
screenshot: screenshot
|
||||
hash: hash
|
||||
|
||||
# Must be either SHA256 or SHA3_512
|
||||
hash_algorithm: SHA3_512
|
||||
# hash_algorithm: SHA256
|
||||
|
||||
|
||||
Reference in New Issue
Block a user