mirror of
https://github.com/bellingcat/auto-archiver.git
synced 2026-06-12 05:08:28 +03:00
Merge main
This commit is contained in:
@@ -1 +1 @@
|
||||
from atlos_db import AtlosDb
|
||||
from .atlos_db import AtlosDb
|
||||
1
src/auto_archiver/modules/atlos_storage/__init__.py
Normal file
1
src/auto_archiver/modules/atlos_storage/__init__.py
Normal file
@@ -0,0 +1 @@
|
||||
from .atlos_storage import AtlosStorage
|
||||
@@ -281,7 +281,7 @@ class GenericExtractor(Extractor):
|
||||
# set up auth
|
||||
auth = self.auth_for_site(url, extract_cookies=False)
|
||||
|
||||
# order of importance: username/password -> api_key -> cookie -> cookie_from_browser -> cookies_file
|
||||
# order of importance: username/password -> api_key -> cookie -> cookies_from_browser -> cookies_file
|
||||
if auth:
|
||||
if 'username' in auth and 'password' in auth:
|
||||
logger.debug(f'Using provided auth username and password for {url}')
|
||||
@@ -290,7 +290,7 @@ class GenericExtractor(Extractor):
|
||||
elif 'cookie' in auth:
|
||||
logger.debug(f'Using provided auth cookie for {url}')
|
||||
yt_dlp.utils.std_headers['cookie'] = auth['cookie']
|
||||
elif 'cookie_from_browser' in auth:
|
||||
elif 'cookies_from_browser' in auth:
|
||||
logger.debug(f'Using extracted cookies from browser {auth["cookies_from_browser"]} for {url}')
|
||||
ydl_options['cookiesfrombrowser'] = auth['cookies_from_browser']
|
||||
elif 'cookies_file' in auth:
|
||||
|
||||
@@ -10,7 +10,6 @@ from auto_archiver.version import __version__
|
||||
from auto_archiver.core import Metadata, Media
|
||||
from auto_archiver.core import Formatter
|
||||
from auto_archiver.utils.misc import random_str
|
||||
from auto_archiver.core.module import get_module
|
||||
|
||||
class HtmlFormatter(Formatter):
|
||||
environment: Environment = None
|
||||
@@ -50,7 +49,7 @@ class HtmlFormatter(Formatter):
|
||||
final_media = Media(filename=html_path, _mimetype="text/html")
|
||||
|
||||
# get the already instantiated hash_enricher module
|
||||
he = get_module('hash_enricher', self.config)
|
||||
he = self.module_factory.get_module('hash_enricher', self.config)
|
||||
if len(hd := he.calculate_hash(final_media.filename)):
|
||||
final_media.set("hash", f"{he.algorithm}:{hd}")
|
||||
|
||||
|
||||
@@ -77,13 +77,14 @@ class InstagramTbotExtractor(Extractor):
|
||||
chat, since_id = self._send_url_to_bot(url)
|
||||
message = self._process_messages(chat, since_id, tmp_dir, result)
|
||||
|
||||
# This may be outdated and replaced by the below message, but keeping until confirmed
|
||||
if "You must enter a URL to a post" in message:
|
||||
logger.debug(f"invalid link {url=} for {self.name}: {message}")
|
||||
return False
|
||||
# # TODO: It currently returns this as a success - is that intentional?
|
||||
# if "Media not found or unavailable" in message:
|
||||
# logger.debug(f"invalid link {url=} for {self.name}: {message}")
|
||||
# return False
|
||||
|
||||
if "Media not found or unavailable" in message:
|
||||
logger.debug(f"No media found for link {url=} for {self.name}: {message}")
|
||||
return False
|
||||
|
||||
if message:
|
||||
result.set_content(message).set_title(message[:128])
|
||||
|
||||
@@ -11,6 +11,10 @@ from auto_archiver.core import Media, Metadata
|
||||
|
||||
class ScreenshotEnricher(Enricher):
|
||||
|
||||
def __init__(self, webdriver_factory=None):
|
||||
super().__init__()
|
||||
self.webdriver_factory = webdriver_factory or Webdriver
|
||||
|
||||
def enrich(self, to_enrich: Metadata) -> None:
|
||||
url = to_enrich.get_url()
|
||||
|
||||
@@ -20,7 +24,8 @@ class ScreenshotEnricher(Enricher):
|
||||
|
||||
logger.debug(f"Enriching screenshot for {url=}")
|
||||
auth = self.auth_for_site(url)
|
||||
with Webdriver(self.width, self.height, self.timeout, facebook_accept_cookies='facebook.com' in url,
|
||||
with self.webdriver_factory(
|
||||
self.width, self.height, self.timeout, facebook_accept_cookies='facebook.com' in url,
|
||||
http_proxy=self.http_proxy, print_options=self.print_options, auth=auth) as driver:
|
||||
try:
|
||||
driver.get(url)
|
||||
@@ -38,3 +43,4 @@ class ScreenshotEnricher(Enricher):
|
||||
logger.info("TimeoutException loading page for screenshot")
|
||||
except Exception as e:
|
||||
logger.error(f"Got error while loading webdriver for screenshot enricher: {e}")
|
||||
|
||||
|
||||
@@ -7,8 +7,12 @@
|
||||
"bin": ["ffmpeg"]
|
||||
},
|
||||
"configs": {
|
||||
"thumbnails_per_minute": {"default": 60, "help": "how many thumbnails to generate per minute of video, can be limited by max_thumbnails"},
|
||||
"max_thumbnails": {"default": 16, "help": "limit the number of thumbnails to generate per video, 0 means no limit"},
|
||||
"thumbnails_per_minute": {"default": 60,
|
||||
"type": "int",
|
||||
"help": "how many thumbnails to generate per minute of video, can be limited by max_thumbnails"},
|
||||
"max_thumbnails": {"default": 16,
|
||||
"type": "int",
|
||||
"help": "limit the number of thumbnails to generate per video, 0 means no limit"},
|
||||
},
|
||||
"description": """
|
||||
Generates thumbnails for video files to provide visual previews.
|
||||
|
||||
@@ -42,7 +42,7 @@ class ThumbnailEnricher(Enricher):
|
||||
logger.error(f"error getting duration of video {m.filename}: {e}")
|
||||
return
|
||||
|
||||
num_thumbs = int(min(max(1, duration * self.thumbnails_per_minute), self.max_thumbnails))
|
||||
num_thumbs = int(min(max(1, (duration / 60) * self.thumbnails_per_minute), self.max_thumbnails))
|
||||
timestamps = [duration / (num_thumbs + 1) * i for i in range(1, num_thumbs + 1)]
|
||||
|
||||
thumbnails_media = []
|
||||
|
||||
@@ -4,7 +4,6 @@ from loguru import logger
|
||||
|
||||
from auto_archiver.core import Enricher
|
||||
from auto_archiver.core import Metadata, Media
|
||||
from auto_archiver.core.module import get_module
|
||||
|
||||
class WhisperEnricher(Enricher):
|
||||
"""
|
||||
@@ -15,7 +14,7 @@ class WhisperEnricher(Enricher):
|
||||
|
||||
def setup(self) -> None:
|
||||
self.stores = self.config['steps']['storages']
|
||||
self.s3 = get_module("s3_storage", self.config)
|
||||
self.s3 = self.module_factory.get_module("s3_storage", self.config)
|
||||
if not "s3_storage" in self.stores:
|
||||
logger.error("WhisperEnricher: To use the WhisperEnricher you need to use S3Storage so files are accessible publicly to the whisper service being called.")
|
||||
return
|
||||
@@ -29,8 +28,7 @@ class WhisperEnricher(Enricher):
|
||||
job_results = {}
|
||||
for i, m in enumerate(to_enrich.media):
|
||||
if m.is_video() or m.is_audio():
|
||||
# TODO: this used to pass all storage items to store now
|
||||
# Now only passing S3, the rest will get added later in the usual order (?)
|
||||
# Only storing S3, the rest will get added later in the usual order (?)
|
||||
m.store(url=url, metadata=to_enrich, storages=[self.s3])
|
||||
try:
|
||||
job_id = self.submit_job(m)
|
||||
|
||||
Reference in New Issue
Block a user