mirror of
https://github.com/bellingcat/auto-archiver.git
synced 2026-06-08 03:18:28 +03:00
cleanup and docs
This commit is contained in:
20
README.md
20
README.md
@@ -68,3 +68,23 @@ To make it easier to set up new auto-archiver sheets, the auto-auto-archiver wil
|
||||
|
||||

|
||||
|
||||
# Code structure
|
||||
Code is split into functional concepts:
|
||||
1. [Archivers](archivers/) - receive a URL and try to archive it
|
||||
2. [Storages](storages/) - handle where the archived files are stored
|
||||
3. Utilities
|
||||
   1. [GWorksheet](gworksheet.py) - facilitates some of the reading/writing tasks for a Google Worksheet
|
||||
|
||||
### Current Archivers
|
||||
```mermaid
|
||||
graph TD
|
||||
A(Archiver) -->|parent of| B(TelegramArchiver)
|
||||
A -->|parent of| C(TiktokArchiver)
|
||||
A -->|parent of| D(YoutubeDLArchiver)
|
||||
A -->|parent of| E(WaybackArchiver)
|
||||
```
|
||||
### Current Storages
|
||||
```mermaid
|
||||
graph TD
|
||||
A(Storage) -->|parent of| B(S3Storage)
|
||||
```
|
||||
|
||||
@@ -3,6 +3,7 @@ import ffmpeg
|
||||
import datetime
|
||||
from dataclasses import dataclass
|
||||
from abc import ABC, abstractmethod
|
||||
from urllib.parse import urlparse
|
||||
|
||||
from storages import Storage
|
||||
|
||||
@@ -30,6 +31,9 @@ class Archiver(ABC):
|
||||
@abstractmethod
|
||||
def download(self, url, check_if_exists=False): pass
|
||||
|
||||
def get_netloc(self, url):
|
||||
return urlparse(url).netloc
|
||||
|
||||
def get_key(self, filename):
|
||||
"""
|
||||
returns a key in the format "[archiverName]_[filename]" includes extension
|
||||
@@ -40,9 +44,12 @@ class Archiver(ABC):
|
||||
_id = _id.replace('unknown_video', 'jpg')
|
||||
return f'{self.name}_{_id}{extension}'
|
||||
|
||||
def get_thumbnails(self, filename, duration=None):
|
||||
if not os.path.exists(filename.split('.')[0]):
|
||||
os.mkdir(filename.split('.')[0])
|
||||
def get_thumbnails(self, filename, key, duration=None):
|
||||
thumbnails_folder = filename.split('.')[0] + '/'
|
||||
key_folder = key.split('.')[0] + '/'
|
||||
|
||||
if not os.path.exists(thumbnails_folder):
|
||||
os.mkdir(thumbnails_folder)
|
||||
|
||||
fps = 0.5
|
||||
if duration is not None:
|
||||
@@ -57,15 +64,14 @@ class Archiver(ABC):
|
||||
|
||||
stream = ffmpeg.input(filename)
|
||||
stream = ffmpeg.filter(stream, 'fps', fps=fps).filter('scale', 512, -1)
|
||||
stream.output(filename.split('.')[0] + '/out%d.jpg').run()
|
||||
stream.output(thumbnails_folder + 'out%d.jpg').run()
|
||||
|
||||
thumbnails = os.listdir(filename.split('.')[0] + '/')
|
||||
thumbnails = os.listdir(thumbnails_folder)
|
||||
cdn_urls = []
|
||||
|
||||
for fname in thumbnails:
|
||||
if fname[-3:] == 'jpg':
|
||||
thumbnail_filename = filename.split('.')[0] + '/' + fname
|
||||
key = filename.split('/')[1].split('.')[0] + '/' + fname
|
||||
thumbnail_filename = thumbnails_folder + fname
|
||||
key = key_folder + fname
|
||||
|
||||
cdn_url = self.storage.get_cdn_url(key)
|
||||
|
||||
@@ -86,12 +92,12 @@ class Archiver(ABC):
|
||||
index_page += f'<img src="{t}" />'
|
||||
|
||||
index_page += f"</body></html>"
|
||||
index_fname = filename.split('.')[0] + '/index.html'
|
||||
index_fname = thumbnails_folder + 'index.html'
|
||||
|
||||
with open(index_fname, 'w') as f:
|
||||
f.write(index_page)
|
||||
|
||||
thumb_index = filename.split('/')[1].split('.')[0] + '/index.html'
|
||||
thumb_index = key_folder + 'index.html'
|
||||
|
||||
self.storage.upload(index_fname, thumb_index, extra_args={'ACL': 'public-read', 'ContentType': 'text/html'})
|
||||
|
||||
|
||||
@@ -10,7 +10,7 @@ class TelegramArchiver(Archiver):
|
||||
|
||||
def download(self, url, check_if_exists=False):
|
||||
# detect URLs that we definitely cannot handle
|
||||
if 'http://t.me/' not in url and 'https://t.me/' not in url:
|
||||
if 't.me' != self.get_netloc(url):
|
||||
return False
|
||||
|
||||
headers = {
|
||||
@@ -20,7 +20,7 @@ class TelegramArchiver(Archiver):
|
||||
|
||||
original_url = url
|
||||
|
||||
# TODO: check if we can do this more resilient to user-input
|
||||
# TODO: check if we can do this more resilient to variable URLs
|
||||
if url[-8:] != "?embed=1":
|
||||
url += "?embed=1"
|
||||
|
||||
@@ -32,8 +32,8 @@ class TelegramArchiver(Archiver):
|
||||
return False # could not find video
|
||||
|
||||
video_url = video.get('src')
|
||||
key = video_url.split('/')[-1].split('?')[0]
|
||||
key = self.get_key(key)
|
||||
video_id = video_url.split('/')[-1].split('?')[0]
|
||||
key = self.get_key(video_id)
|
||||
|
||||
filename = 'tmp/' + key
|
||||
|
||||
@@ -60,7 +60,7 @@ class TelegramArchiver(Archiver):
|
||||
duration = float(duration)
|
||||
|
||||
# process thumbnails
|
||||
key_thumb, thumb_index = self.get_thumbnails(filename, duration=duration)
|
||||
key_thumb, thumb_index = self.get_thumbnails(filename, key, duration=duration)
|
||||
os.remove(filename)
|
||||
|
||||
return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb, thumbnail_index=thumb_index,
|
||||
|
||||
@@ -37,8 +37,9 @@ class TiktokArchiver(Archiver):
|
||||
self.storage.upload(filename, key)
|
||||
|
||||
try:
|
||||
key_thumb, thumb_index = self.get_thumbnails(filename, duration=info.duration)
|
||||
except:
|
||||
key_thumb, thumb_index = self.get_thumbnails(filename, key, duration=info.duration)
|
||||
except Exception as e:
|
||||
logger.error(e)
|
||||
key_thumb = ''
|
||||
thumb_index = 'error creating thumbnails'
|
||||
|
||||
|
||||
@@ -9,14 +9,15 @@ from .base_archiver import Archiver, ArchiveResult
|
||||
|
||||
class YoutubeDLArchiver(Archiver):
|
||||
name = "yotube_dl"
|
||||
ydl_opts = {'outtmpl': 'tmp/%(id)s.%(ext)s', 'quiet': False}
|
||||
|
||||
def download(self, url, check_if_exists=False):
|
||||
ydl_opts = {'outtmpl': 'tmp/%(id)s.%(ext)s', 'quiet': False}
|
||||
if (url[0:21] == 'https://facebook.com/' or url[0:25] == 'https://wwww.facebook.com/') and os.getenv('FB_COOKIE'):
|
||||
netloc = self.get_netloc(url)
|
||||
if netloc in ['facebook.com', 'wwww.facebook.com'] and os.getenv('FB_COOKIE'):
|
||||
logger.info('Using Facebook cookie')
|
||||
youtube_dl.utils.std_headers['cookie'] = os.getenv('FB_COOKIE')
|
||||
|
||||
ydl = youtube_dl.YoutubeDL(ydl_opts)
|
||||
ydl = youtube_dl.YoutubeDL(YoutubeDLArchiver.ydl_opts)
|
||||
cdn_url = None
|
||||
status = 'success'
|
||||
|
||||
@@ -26,7 +27,7 @@ class YoutubeDLArchiver(Archiver):
|
||||
# no video here
|
||||
return False
|
||||
|
||||
if 'is_live' in info and info['is_live']:
|
||||
if info.get('is_live', False):
|
||||
logger.warning("Live streaming media, not archiving now")
|
||||
return ArchiveResult(status="Streaming media")
|
||||
|
||||
@@ -74,11 +75,11 @@ class YoutubeDLArchiver(Archiver):
|
||||
self.storage.upload(filename, key)
|
||||
|
||||
# get duration
|
||||
duration = info['duration'] if 'duration' in info else None
|
||||
duration = info.get('duration')
|
||||
|
||||
# get thumbnails
|
||||
try:
|
||||
key_thumb, thumb_index = self.get_thumbnails(filename, duration=duration)
|
||||
key_thumb, thumb_index = self.get_thumbnails(filename, key, duration=duration)
|
||||
except:
|
||||
key_thumb = ''
|
||||
thumb_index = 'Could not generate thumbnails'
|
||||
|
||||
@@ -63,13 +63,13 @@ class GWorksheet:
|
||||
"""
|
||||
cell_updates = [
|
||||
{
|
||||
'range': self.to_a1(row, self._col_index(col) + 1),
|
||||
'range': self.to_a1(row, col),
|
||||
'values': [[val]]
|
||||
}
|
||||
for row, col, val in cell_updates
|
||||
]
|
||||
self.wks.batch_update(cell_updates, value_input_option='USER_ENTERED')
|
||||
|
||||
def to_a1(self, row: int, col: int):
|
||||
# row, col are 1-based
|
||||
return utils.rowcol_to_a1(row, col)
|
||||
def to_a1(self, row: int, col: str):
|
||||
# row is 1-based
|
||||
return utils.rowcol_to_a1(row, self._col_index(col) + 1)
|
||||
|
||||
Reference in New Issue
Block a user