mirror of
https://github.com/bellingcat/auto-archiver.git
synced 2026-06-12 21:28:29 +03:00
cleanup and docs
This commit is contained in:
20
README.md
20
README.md
@@ -68,3 +68,23 @@ To make it easier to set up new auto-archiver sheets, the auto-auto-archiver wil
|
|||||||
|
|
||||||

|

|
||||||
|
|
||||||
|
# Code structure
|
||||||
|
Code is split into functional concepts:
|
||||||
|
1. [Archivers](archivers/) - receive a URL and try to archive it
|
||||||
|
2. [Storages](storages/) - handle where the archived files are stored
|
||||||
|
3. Utilities
|
||||||
|
1. [GWorksheet](gworksheet.py) - facilitates some of the reading/writing tasks for a Google Worksheet
|
||||||
|
|
||||||
|
### Current Archivers
|
||||||
|
```mermaid
|
||||||
|
graph TD
|
||||||
|
A(Archiver) -->|parent of| B(TelegramArchiver)
|
||||||
|
A -->|parent of| C(TikTokArchiver)
|
||||||
|
A -->|parent of| D(YoutubeDLArchiver)
|
||||||
|
A -->|parent of| E(WaybackArchiver)
|
||||||
|
```
|
||||||
|
### Current Storages
|
||||||
|
```mermaid
|
||||||
|
graph TD
|
||||||
|
A(BaseStorage) -->|parent of| B(S3Storage)
|
||||||
|
```
|
||||||
|
|||||||
@@ -3,6 +3,7 @@ import ffmpeg
|
|||||||
import datetime
|
import datetime
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
from abc import ABC, abstractmethod
|
from abc import ABC, abstractmethod
|
||||||
|
from urllib.parse import urlparse
|
||||||
|
|
||||||
from storages import Storage
|
from storages import Storage
|
||||||
|
|
||||||
@@ -30,6 +31,9 @@ class Archiver(ABC):
|
|||||||
@abstractmethod
|
@abstractmethod
|
||||||
def download(self, url, check_if_exists=False): pass
|
def download(self, url, check_if_exists=False): pass
|
||||||
|
|
||||||
|
def get_netloc(self, url):
|
||||||
|
return urlparse(url).netloc
|
||||||
|
|
||||||
def get_key(self, filename):
|
def get_key(self, filename):
|
||||||
"""
|
"""
|
||||||
returns a key in the format "[archiverName]_[filename]" includes extension
|
returns a key in the format "[archiverName]_[filename]" includes extension
|
||||||
@@ -40,9 +44,12 @@ class Archiver(ABC):
|
|||||||
_id = _id.replace('unknown_video', 'jpg')
|
_id = _id.replace('unknown_video', 'jpg')
|
||||||
return f'{self.name}_{_id}{extension}'
|
return f'{self.name}_{_id}{extension}'
|
||||||
|
|
||||||
def get_thumbnails(self, filename, duration=None):
|
def get_thumbnails(self, filename, key, duration=None):
|
||||||
if not os.path.exists(filename.split('.')[0]):
|
thumbnails_folder = filename.split('.')[0] + '/'
|
||||||
os.mkdir(filename.split('.')[0])
|
key_folder = key.split('.')[0] + '/'
|
||||||
|
|
||||||
|
if not os.path.exists(thumbnails_folder):
|
||||||
|
os.mkdir(thumbnails_folder)
|
||||||
|
|
||||||
fps = 0.5
|
fps = 0.5
|
||||||
if duration is not None:
|
if duration is not None:
|
||||||
@@ -57,15 +64,14 @@ class Archiver(ABC):
|
|||||||
|
|
||||||
stream = ffmpeg.input(filename)
|
stream = ffmpeg.input(filename)
|
||||||
stream = ffmpeg.filter(stream, 'fps', fps=fps).filter('scale', 512, -1)
|
stream = ffmpeg.filter(stream, 'fps', fps=fps).filter('scale', 512, -1)
|
||||||
stream.output(filename.split('.')[0] + '/out%d.jpg').run()
|
stream.output(thumbnails_folder + 'out%d.jpg').run()
|
||||||
|
|
||||||
thumbnails = os.listdir(filename.split('.')[0] + '/')
|
thumbnails = os.listdir(thumbnails_folder)
|
||||||
cdn_urls = []
|
cdn_urls = []
|
||||||
|
|
||||||
for fname in thumbnails:
|
for fname in thumbnails:
|
||||||
if fname[-3:] == 'jpg':
|
if fname[-3:] == 'jpg':
|
||||||
thumbnail_filename = filename.split('.')[0] + '/' + fname
|
thumbnail_filename = thumbnails_folder + fname
|
||||||
key = filename.split('/')[1].split('.')[0] + '/' + fname
|
key = key_folder + fname
|
||||||
|
|
||||||
cdn_url = self.storage.get_cdn_url(key)
|
cdn_url = self.storage.get_cdn_url(key)
|
||||||
|
|
||||||
@@ -86,12 +92,12 @@ class Archiver(ABC):
|
|||||||
index_page += f'<img src="{t}" />'
|
index_page += f'<img src="{t}" />'
|
||||||
|
|
||||||
index_page += f"</body></html>"
|
index_page += f"</body></html>"
|
||||||
index_fname = filename.split('.')[0] + '/index.html'
|
index_fname = thumbnails_folder + 'index.html'
|
||||||
|
|
||||||
with open(index_fname, 'w') as f:
|
with open(index_fname, 'w') as f:
|
||||||
f.write(index_page)
|
f.write(index_page)
|
||||||
|
|
||||||
thumb_index = filename.split('/')[1].split('.')[0] + '/index.html'
|
thumb_index = key_folder + 'index.html'
|
||||||
|
|
||||||
self.storage.upload(index_fname, thumb_index, extra_args={'ACL': 'public-read', 'ContentType': 'text/html'})
|
self.storage.upload(index_fname, thumb_index, extra_args={'ACL': 'public-read', 'ContentType': 'text/html'})
|
||||||
|
|
||||||
|
|||||||
@@ -10,7 +10,7 @@ class TelegramArchiver(Archiver):
|
|||||||
|
|
||||||
def download(self, url, check_if_exists=False):
|
def download(self, url, check_if_exists=False):
|
||||||
# detect URLs that we definitely cannot handle
|
# detect URLs that we definitely cannot handle
|
||||||
if 'http://t.me/' not in url and 'https://t.me/' not in url:
|
if 't.me' != self.get_netloc(url):
|
||||||
return False
|
return False
|
||||||
|
|
||||||
headers = {
|
headers = {
|
||||||
@@ -20,7 +20,7 @@ class TelegramArchiver(Archiver):
|
|||||||
|
|
||||||
original_url = url
|
original_url = url
|
||||||
|
|
||||||
# TODO: check if we can do this more resilient to user-input
|
# TODO: check if we can make this more resilient to variable URLs
|
||||||
if url[-8:] != "?embed=1":
|
if url[-8:] != "?embed=1":
|
||||||
url += "?embed=1"
|
url += "?embed=1"
|
||||||
|
|
||||||
@@ -32,8 +32,8 @@ class TelegramArchiver(Archiver):
|
|||||||
return False # could not find video
|
return False # could not find video
|
||||||
|
|
||||||
video_url = video.get('src')
|
video_url = video.get('src')
|
||||||
key = video_url.split('/')[-1].split('?')[0]
|
video_id = video_url.split('/')[-1].split('?')[0]
|
||||||
key = self.get_key(key)
|
key = self.get_key(video_id)
|
||||||
|
|
||||||
filename = 'tmp/' + key
|
filename = 'tmp/' + key
|
||||||
|
|
||||||
@@ -60,7 +60,7 @@ class TelegramArchiver(Archiver):
|
|||||||
duration = float(duration)
|
duration = float(duration)
|
||||||
|
|
||||||
# process thumbnails
|
# process thumbnails
|
||||||
key_thumb, thumb_index = self.get_thumbnails(filename, duration=duration)
|
key_thumb, thumb_index = self.get_thumbnails(filename, key, duration=duration)
|
||||||
os.remove(filename)
|
os.remove(filename)
|
||||||
|
|
||||||
return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb, thumbnail_index=thumb_index,
|
return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb, thumbnail_index=thumb_index,
|
||||||
|
|||||||
@@ -37,8 +37,9 @@ class TiktokArchiver(Archiver):
|
|||||||
self.storage.upload(filename, key)
|
self.storage.upload(filename, key)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
key_thumb, thumb_index = self.get_thumbnails(filename, duration=info.duration)
|
key_thumb, thumb_index = self.get_thumbnails(filename, key, duration=info.duration)
|
||||||
except:
|
except Exception as e:
|
||||||
|
logger.error(e)
|
||||||
key_thumb = ''
|
key_thumb = ''
|
||||||
thumb_index = 'error creating thumbnails'
|
thumb_index = 'error creating thumbnails'
|
||||||
|
|
||||||
|
|||||||
@@ -9,14 +9,15 @@ from .base_archiver import Archiver, ArchiveResult
|
|||||||
|
|
||||||
class YoutubeDLArchiver(Archiver):
|
class YoutubeDLArchiver(Archiver):
|
||||||
name = "yotube_dl"
|
name = "yotube_dl"
|
||||||
|
ydl_opts = {'outtmpl': 'tmp/%(id)s.%(ext)s', 'quiet': False}
|
||||||
|
|
||||||
def download(self, url, check_if_exists=False):
|
def download(self, url, check_if_exists=False):
|
||||||
ydl_opts = {'outtmpl': 'tmp/%(id)s.%(ext)s', 'quiet': False}
|
netloc = self.get_netloc(url)
|
||||||
if (url[0:21] == 'https://facebook.com/' or url[0:25] == 'https://wwww.facebook.com/') and os.getenv('FB_COOKIE'):
|
if netloc in ['facebook.com', 'wwww.facebook.com'] and os.getenv('FB_COOKIE'):
|
||||||
logger.info('Using Facebook cookie')
|
logger.info('Using Facebook cookie')
|
||||||
youtube_dl.utils.std_headers['cookie'] = os.getenv('FB_COOKIE')
|
youtube_dl.utils.std_headers['cookie'] = os.getenv('FB_COOKIE')
|
||||||
|
|
||||||
ydl = youtube_dl.YoutubeDL(ydl_opts)
|
ydl = youtube_dl.YoutubeDL(YoutubeDLArchiver.ydl_opts)
|
||||||
cdn_url = None
|
cdn_url = None
|
||||||
status = 'success'
|
status = 'success'
|
||||||
|
|
||||||
@@ -26,7 +27,7 @@ class YoutubeDLArchiver(Archiver):
|
|||||||
# no video here
|
# no video here
|
||||||
return False
|
return False
|
||||||
|
|
||||||
if 'is_live' in info and info['is_live']:
|
if info.get('is_live', False):
|
||||||
logger.warning("Live streaming media, not archiving now")
|
logger.warning("Live streaming media, not archiving now")
|
||||||
return ArchiveResult(status="Streaming media")
|
return ArchiveResult(status="Streaming media")
|
||||||
|
|
||||||
@@ -74,11 +75,11 @@ class YoutubeDLArchiver(Archiver):
|
|||||||
self.storage.upload(filename, key)
|
self.storage.upload(filename, key)
|
||||||
|
|
||||||
# get duration
|
# get duration
|
||||||
duration = info['duration'] if 'duration' in info else None
|
duration = info.get('duration')
|
||||||
|
|
||||||
# get thumbnails
|
# get thumbnails
|
||||||
try:
|
try:
|
||||||
key_thumb, thumb_index = self.get_thumbnails(filename, duration=duration)
|
key_thumb, thumb_index = self.get_thumbnails(filename, key, duration=duration)
|
||||||
except:
|
except:
|
||||||
key_thumb = ''
|
key_thumb = ''
|
||||||
thumb_index = 'Could not generate thumbnails'
|
thumb_index = 'Could not generate thumbnails'
|
||||||
|
|||||||
@@ -63,13 +63,13 @@ class GWorksheet:
|
|||||||
"""
|
"""
|
||||||
cell_updates = [
|
cell_updates = [
|
||||||
{
|
{
|
||||||
'range': self.to_a1(row, self._col_index(col) + 1),
|
'range': self.to_a1(row, col),
|
||||||
'values': [[val]]
|
'values': [[val]]
|
||||||
}
|
}
|
||||||
for row, col, val in cell_updates
|
for row, col, val in cell_updates
|
||||||
]
|
]
|
||||||
self.wks.batch_update(cell_updates, value_input_option='USER_ENTERED')
|
self.wks.batch_update(cell_updates, value_input_option='USER_ENTERED')
|
||||||
|
|
||||||
def to_a1(self, row: int, col: int):
|
def to_a1(self, row: int, col: str):
|
||||||
# row, col are 1-based
|
# row is 1-based
|
||||||
return utils.rowcol_to_a1(row, col)
|
return utils.rowcol_to_a1(row, self._col_index(col) + 1)
|
||||||
|
|||||||
Reference in New Issue
Block a user