Compare commits

...

11 Commits

Author SHA1 Message Date
msramalho
0ecbed0df0 Bump version to v0.5.6 for release 2023-04-18 18:49:08 +01:00
msramalho
69bcfea2eb to_json fix 2023-04-18 18:48:51 +01:00
msramalho
2e2e695444 whisper enricher 2023-03-23 18:50:37 +00:00
msramalho
493055a8d9 cleanup 2023-03-23 18:50:30 +00:00
msramalho
6f6eb2db7a Archiving Context refactor complete 2023-03-23 14:28:45 +00:00
msramalho
906ed0f6e0 creating global context and refactoring tmp_dir logic 2023-03-23 11:17:38 +00:00
msramalho
39818e648a Bump version to v0.4.5 for release 2023-03-16 15:05:42 +00:00
Miguel Sozinho Ramalho
2bbf534d67 Merge pull request #72 from milesmcc/patch-1
Fix hash enricher for flatfile output (closes #71)
2023-03-16 15:04:55 +00:00
R. Miles McCain
6be7536fad Fix hash enricher for flatfile output (closes #71) 2023-03-14 13:37:54 -07:00
msramalho
0654e8c5c6 hash calculation in chunks to avoid exhausting RAM 2023-03-10 11:34:29 +00:00
msramalho
0e3c427371 Bump version to v0.4.3 for release 2023-02-27 10:30:06 +01:00
25 changed files with 332 additions and 109 deletions

View File

@@ -3,8 +3,8 @@ from abc import abstractmethod
from dataclasses import dataclass
import os
import mimetypes, requests
from ..core import Metadata
from ..core import Step
from ..core import Metadata, Step, ArchivingContext
@dataclass
@@ -51,7 +51,7 @@ class Archiver(Step):
if len(to_filename) > 64:
to_filename = to_filename[-64:]
if item:
to_filename = os.path.join(item.get_tmp_dir(), to_filename)
to_filename = os.path.join(ArchivingContext.get_tmp_dir(), to_filename)
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'
}

View File

@@ -4,7 +4,7 @@ from loguru import logger
import time, os
from sqlite3 import OperationalError
from . import Archiver
from ..core import Metadata, Media
from ..core import Metadata, Media, ArchivingContext
class InstagramTbotArchiver(Archiver):
@@ -44,7 +44,7 @@ class InstagramTbotArchiver(Archiver):
if not "instagram.com" in url: return False
result = Metadata()
tmp_dir = item.get_tmp_dir()
tmp_dir = ArchivingContext.get_tmp_dir()
with self.client.start():
chat = self.client.get_entity("instagram_load_bot")
since_id = self.client.send_message(entity=chat, message=url).id

View File

@@ -8,7 +8,7 @@ from tqdm import tqdm
import re, time, json, os
from . import Archiver
from ..core import Metadata, Media
from ..core import Metadata, Media, ArchivingContext
class TelethonArchiver(Archiver):
@@ -128,7 +128,7 @@ class TelethonArchiver(Archiver):
media_posts = self._get_media_posts_in_group(chat, post)
logger.debug(f'got {len(media_posts)=} for {url=}')
tmp_dir = item.get_tmp_dir()
tmp_dir = ArchivingContext.get_tmp_dir()
group_id = post.grouped_id if post.grouped_id is not None else post.id
title = post.message

View File

@@ -3,7 +3,7 @@ import tiktok_downloader
from loguru import logger
from . import Archiver
from ..core import Metadata, Media
from ..core import Metadata, Media, ArchivingContext
class TiktokArchiver(Archiver):
@@ -41,7 +41,7 @@ class TiktokArchiver(Archiver):
logger.warning(f'Other Tiktok error {error}')
try:
filename = os.path.join(item.get_tmp_dir(), f'{str(uuid.uuid4())[0:8]}.mp4')
filename = os.path.join(ArchivingContext.get_tmp_dir(), f'{str(uuid.uuid4())[0:8]}.mp4')
tiktok_media = tiktok_downloader.snaptik(url).get_media()
if len(tiktok_media) <= 0:

View File

@@ -3,7 +3,7 @@ from vk_url_scraper import VkScraper
from ..utils.misc import dump_payload
from . import Archiver
from ..core import Metadata, Media
from ..core import Metadata, Media, ArchivingContext
class VkArchiver(Archiver):
@@ -50,7 +50,7 @@ class VkArchiver(Archiver):
result.set_content(dump_payload(vk_scrapes))
filenames = self.vks.download_media(vk_scrapes, item.get_tmp_dir())
filenames = self.vks.download_media(vk_scrapes, ArchivingContext.get_tmp_dir())
for filename in filenames:
result.add_media(Media(filename))

View File

@@ -2,7 +2,7 @@ import datetime, os, yt_dlp
from loguru import logger
from . import Archiver
from ..core import Metadata, Media
from ..core import Metadata, Media, ArchivingContext
class YoutubeDLArchiver(Archiver):
@@ -25,7 +25,7 @@ class YoutubeDLArchiver(Archiver):
logger.debug('Using Facebook cookie')
yt_dlp.utils.std_headers['cookie'] = self.facebook_cookie
ydl = yt_dlp.YoutubeDL({'outtmpl': os.path.join(item.get_tmp_dir(), f'%(id)s.%(ext)s'), 'quiet': False})
ydl = yt_dlp.YoutubeDL({'outtmpl': os.path.join(ArchivingContext.get_tmp_dir(), f'%(id)s.%(ext)s'), 'quiet': False})
try:
# don'd download since it can be a live stream

View File

@@ -1,6 +1,7 @@
from .media import Media
from .metadata import Metadata
from .media import Media
from .step import Step
from .context import ArchivingContext
# cannot import ArchivingOrchestrator/Config to avoid circular dep
# from .orchestrator import ArchivingOrchestrator

View File

@@ -0,0 +1,53 @@
from loguru import logger
class ArchivingContext:
"""
Singleton context class.
ArchivingContext._get_instance() to retrieve it if needed
otherwise just
ArchivingContext.set(key, value)
and
ArchivingContext.get(key, default)
When reset is called, all values are cleared EXCEPT if they were .set(keep_on_reset=True)
reset(full_reset=True) will recreate everything including the keep_on_reset status
"""
_instance = None
def __init__(self):
self.configs = {}
self.keep_on_reset = set()
@staticmethod
def get_instance():
if ArchivingContext._instance is None:
ArchivingContext._instance = ArchivingContext()
return ArchivingContext._instance
@staticmethod
def set(key, value, keep_on_reset: bool = False):
logger.error(f"SET [{key}]={value}")
ac = ArchivingContext.get_instance()
ac.configs[key] = value
if keep_on_reset: ac.keep_on_reset.add(key)
@staticmethod
def get(key: str, default=None):
return ArchivingContext.get_instance().configs.get(key, default)
@staticmethod
def reset(full_reset: bool = False):
ac = ArchivingContext.get_instance()
if full_reset: ac.keep_on_reset = set()
ac.configs = {k: v for k, v in ac.configs.items() if k in ac.keep_on_reset}
# ---- custom getters/setters for widely used context values
@staticmethod
def set_tmp_dir(tmp_dir: str):
ArchivingContext.get_instance().configs["tmp_dir"] = tmp_dir
@staticmethod
def get_tmp_dir() -> str:
return ArchivingContext.get_instance().configs.get("tmp_dir")

View File

@@ -3,18 +3,46 @@ from __future__ import annotations
from ast import List
from typing import Any
from dataclasses import dataclass, field
from dataclasses_json import dataclass_json
from dataclasses_json import dataclass_json, config
import mimetypes
# annotation order matters
@dataclass_json
from .context import ArchivingContext
from loguru import logger
@dataclass_json # annotation order matters
@dataclass
class Media:
filename: str
key: str = None
urls: List[str] = field(default_factory=list)
_mimetype: str = None # eg: image/jpeg
properties: dict = field(default_factory=dict)
_mimetype: str = None # eg: image/jpeg
_stored: bool = field(default=False, repr=False, metadata=config(exclude=lambda _: True)) # always exclude
def store(self: Media, override_storages: List = None, url: str = "url-not-available"):
# stores the media into the provided/available storages [Storage]
# repeats the process for its properties, in case they have inner media themselves
# for now it only goes down 1 level but it's easy to make it recursive if needed
storages = override_storages or ArchivingContext.get("storages")
if not len(storages):
logger.warning(f"No storages found in local context or provided directly for {self.filename}.")
return
for s in storages:
s.store(self, url)
# Media can be inside media properties, examples include transformations on original media
for prop in self.properties.values():
if isinstance(prop, Media):
s.store(prop, url)
if isinstance(prop, list):
for prop_media in prop:
if isinstance(prop_media, Media):
s.store(prop_media, url)
def is_stored(self) -> bool:
return len(self.urls) > 0
def set(self, key: str, value: Any) -> Media:
self.properties[key] = value
@@ -40,3 +68,6 @@ class Media:
def is_video(self) -> bool:
return self.mimetype.startswith("video")
def is_audio(self) -> bool:
return self.mimetype.startswith("audio")

View File

@@ -3,24 +3,25 @@ from __future__ import annotations
from ast import List, Set
from typing import Any, Union, Dict
from dataclasses import dataclass, field
from dataclasses_json import dataclass_json
from dataclasses_json import dataclass_json, config
import datetime
from urllib.parse import urlparse
from dateutil.parser import parse as parse_dt
from .media import Media
from .context import ArchivingContext
# annotation order matters
@dataclass_json
@dataclass_json # annotation order matters
@dataclass
class Metadata:
status: str = "no archiver"
_processed_at: datetime = field(default_factory=datetime.datetime.utcnow)
metadata: Dict[str, Any] = field(default_factory=dict)
tmp_keys: Set[str] = field(default_factory=set, repr=False, metadata={"exclude": True}) # keys that are not to be saved in DBs
media: List[Media] = field(default_factory=list)
rearchivable: bool = True # defaults to true, archivers can overwrite
def __post_init__(self):
self.set("_processed_at", datetime.datetime.utcnow())
def merge(self: Metadata, right: Metadata, overwrite_left=True) -> Metadata:
"""
merges two Metadata instances, will overwrite according to overwrite_left flag
@@ -30,7 +31,6 @@ class Metadata:
if right.status and len(right.status):
self.status = right.status
self.rearchivable |= right.rearchivable
self.tmp_keys |= right.tmp_keys
for k, v in right.metadata.items():
assert k not in self.metadata or type(v) == type(self.get(k))
if type(v) not in [dict, list, set] or k not in self.metadata:
@@ -43,10 +43,14 @@ class Metadata:
return right.merge(self)
return self
def set(self, key: str, val: Any, is_tmp=False) -> Metadata:
# if not self.metadata: self.metadata = {}
def store(self: Metadata, override_storages: List = None):
# calls .store for all contained media. storages [Storage]
storages = override_storages or ArchivingContext.get("storages")
for media in self.media:
media.store(override_storages=storages)
def set(self, key: str, val: Any) -> Metadata:
self.metadata[key] = val
if is_tmp: self.tmp_keys.add(key)
return self
def get(self, key: str, default: Any = None, create_if_missing=False) -> Union[Metadata, str]:
@@ -64,7 +68,7 @@ class Metadata:
return "success" in self.status
def is_empty(self) -> bool:
return not self.is_success() and len(self.media) == 0 and len(self.get_clean_metadata()) <= 2 # url, processed_at
return not self.is_success() and len(self.media) == 0 and len(self.metadata) <= 2 # url, processed_at
@property # getter .netloc
def netloc(self) -> str:
@@ -93,12 +97,6 @@ class Metadata:
def get_title(self) -> str:
return self.get("title")
def set_tmp_dir(self, tmp_dir: str) -> Metadata:
return self.set("tmp_dir", tmp_dir, True)
def get_tmp_dir(self) -> str:
return self.get("tmp_dir")
def set_timestamp(self, timestamp: datetime.datetime) -> Metadata:
if type(timestamp) == str:
timestamp = parse_dt(timestamp)
@@ -139,8 +137,5 @@ class Metadata:
_default = self.media[0] if len(self.media) else None
return self.get_media_by_id("_final_media", _default)
def get_clean_metadata(self) -> Metadata:
return dict(
{k: v for k, v in self.metadata.items() if k not in self.tmp_keys},
**{"processed_at": self._processed_at}
)
def __str__(self) -> str:
return self.__repr__()

View File

@@ -2,6 +2,8 @@ from __future__ import annotations
from ast import List
from typing import Union
from .context import ArchivingContext
from ..archivers import Archiver
from ..feeders import Feeder
from ..formatters import Formatter
@@ -23,6 +25,7 @@ class ArchivingOrchestrator:
self.archivers: List[Archiver] = config.archivers
self.databases: List[Database] = config.databases
self.storages: List[Storage] = config.storages
ArchivingContext.set("storages", self.storages, keep_on_reset=True)
for a in self.archivers: a.setup()
@@ -32,8 +35,9 @@ class ArchivingOrchestrator:
def feed_item(self, item: Metadata) -> Metadata:
try:
ArchivingContext.reset()
with tempfile.TemporaryDirectory(dir="./") as tmp_dir:
item.set_tmp_dir(tmp_dir)
ArchivingContext.set_tmp_dir(tmp_dir)
return self.archive(item)
except KeyboardInterrupt:
# catches keyboard interruptions to do a clean exit
@@ -105,22 +109,12 @@ class ArchivingOrchestrator:
# 5 - store media
# looks for Media in result.media and also result.media[x].properties (as list or dict values)
for s in self.storages:
for m in result.media:
s.store(m, result) # modifies media
# Media can be inside media properties, examples include transformations on original media
for prop in m.properties.values():
if isinstance(prop, Media):
s.store(prop, result)
if isinstance(prop, list) and len(prop) > 0 and isinstance(prop[0], Media):
for prop_media in prop:
s.store(prop_media, result)
result.store()
# 6 - format and store formatted if needed
# enrichers typically need access to already stored URLs etc
if (final_media := self.formatter.format(result)):
for s in self.storages:
s.store(final_media, result)
final_media.store()
result.set_final_media(final_media)
if result.is_empty():

View File

@@ -5,8 +5,7 @@ from urllib.parse import quote
from loguru import logger
from . import Database
from ..core import Metadata
from ..core import Media
from ..core import Metadata, Media, ArchivingContext
from ..utils import GWorksheet
@@ -86,7 +85,7 @@ class GsheetsDb(Database):
logger.debug(f"Unable to update sheet: {e}")
def _retrieve_gsheet(self, item: Metadata) -> Tuple[GWorksheet, int]:
# TODO: to make gsheet_db less coupled with gsheet_feeder's "gsheet" parameter, this method could 1st try to fetch "gsheet" from item and, if missing, manage its own singleton - not needed for now
gw: GWorksheet = item.get("gsheet").get("worksheet")
row: int = item.get("gsheet").get("row")
# TODO: to make gsheet_db less coupled with gsheet_feeder's "gsheet" parameter, this method could 1st try to fetch "gsheet" from ArchivingContext and, if missing, manage its own singleton - not needed for now
gw: GWorksheet = ArchivingContext.get("gsheet").get("worksheet")
row: int = ArchivingContext.get("gsheet").get("row")
return gw, row

View File

@@ -3,4 +3,5 @@ from .screenshot_enricher import ScreenshotEnricher
from .wayback_enricher import WaybackArchiverEnricher
from .hash_enricher import HashEnricher
from .thumbnail_enricher import ThumbnailEnricher
from .wacz_enricher import WaczEnricher
from .wacz_enricher import WaczEnricher
from .whisper_enricher import WhisperEnricher

View File

@@ -16,11 +16,13 @@ class HashEnricher(Enricher):
super().__init__(config)
algo_choices = self.configs()["algorithm"]["choices"]
assert self.algorithm in algo_choices, f"Invalid hash algorithm selected, must be one of {algo_choices} (you selected {self.algorithm})."
self.chunksize = int(self.chunksize)
@staticmethod
def configs() -> dict:
return {
"algorithm": {"default": "SHA-256", "help": "hash algorithm to use", "choices": ["SHA-256", "SHA3-512"]}
"algorithm": {"default": "SHA-256", "help": "hash algorithm to use", "choices": ["SHA-256", "SHA3-512"]},
"chunksize": {"default": 1.6e7, "help": "number of bytes to use when reading files in chunks (if this value is too large you will run out of RAM), default is 16MB"},
}
def enrich(self, to_enrich: Metadata) -> None:
@@ -28,12 +30,19 @@ class HashEnricher(Enricher):
logger.debug(f"calculating media hashes for {url=} (using {self.algorithm})")
for i, m in enumerate(to_enrich.media):
with open(m.filename, "rb") as f:
bytes = f.read() # read entire file as bytes
hash = None
if self.algorithm == "SHA-256":
hash = hashlib.sha256(bytes)
elif self.algorithm == "SHA3-512":
hash = hashlib.sha3_512(bytes)
else: continue
to_enrich.media[i].set("hash", f"{self.algorithm}:{hash.hexdigest()}")
if len(hd := self.calculate_hash(m.filename)):
to_enrich.media[i].set("hash", f"{self.algorithm}:{hd}")
def calculate_hash(self, filename):
hash = None
if self.algorithm == "SHA-256":
hash = hashlib.sha256()
elif self.algorithm == "SHA3-512":
hash = hashlib.sha3_512()
else: return ""
with open(filename, "rb") as f:
while True:
buf = f.read(self.chunksize)
if not buf: break
hash.update(buf)
return hash.hexdigest()

View File

@@ -4,7 +4,7 @@ from selenium.common.exceptions import TimeoutException
from . import Enricher
from ..utils import Webdriver, UrlUtil
from ..core import Media, Metadata
from ..core import Media, Metadata, ArchivingContext
class ScreenshotEnricher(Enricher):
name = "screenshot_enricher"
@@ -14,7 +14,8 @@ class ScreenshotEnricher(Enricher):
return {
"width": {"default": 1280, "help": "width of the screenshots"},
"height": {"default": 720, "help": "height of the screenshots"},
"timeout": {"default": 60, "help": "timeout for taking the screenshot"}
"timeout": {"default": 60, "help": "timeout for taking the screenshot"},
"sleep_before_screenshot": {"default": 4, "help": "seconds to wait for the pages to load before taking screenshot"}
}
def enrich(self, to_enrich: Metadata) -> None:
@@ -27,12 +28,11 @@ class ScreenshotEnricher(Enricher):
with Webdriver(self.width, self.height, self.timeout, 'facebook.com' in url) as driver:
try:
driver.get(url)
time.sleep(2)
screenshot_file = os.path.join(to_enrich.get_tmp_dir(), f"screenshot_{str(uuid.uuid4())[0:8]}.png")
time.sleep(int(self.sleep_before_screenshot))
screenshot_file = os.path.join(ArchivingContext.get_tmp_dir(), f"screenshot_{str(uuid.uuid4())[0:8]}.png")
driver.save_screenshot(screenshot_file)
to_enrich.add_media(Media(filename=screenshot_file), id="screenshot")
except TimeoutException:
logger.info("TimeoutException loading page for screenshot")
except Exception as e:
logger.error(f"Got error while loading webdriver for screenshot enricher: {e}")
# return None

View File

@@ -2,7 +2,7 @@ import ffmpeg, os, uuid
from loguru import logger
from . import Enricher
from ..core import Media, Metadata
from ..core import Media, Metadata, ArchivingContext
class ThumbnailEnricher(Enricher):
@@ -23,7 +23,7 @@ class ThumbnailEnricher(Enricher):
logger.debug(f"generating thumbnails")
for i, m in enumerate(to_enrich.media[::]):
if m.is_video():
folder = os.path.join(to_enrich.get_tmp_dir(), str(uuid.uuid4()))
folder = os.path.join(ArchivingContext.get_tmp_dir(), str(uuid.uuid4()))
os.makedirs(folder, exist_ok=True)
logger.debug(f"generating thumbnails for {m.filename}")
fps, duration = 0.5, m.get("duration")

View File

@@ -1,7 +1,7 @@
import os, shutil, subprocess, uuid
from loguru import logger
from ..core import Media, Metadata
from ..core import Media, Metadata, ArchivingContext
from . import Enricher
from ..utils import UrlUtil
@@ -34,7 +34,7 @@ class WaczEnricher(Enricher):
logger.debug(f"generating WACZ for {url=}")
collection = str(uuid.uuid4())[0:8]
browsertrix_home = os.path.abspath(to_enrich.get_tmp_dir())
browsertrix_home = os.path.abspath(ArchivingContext.get_tmp_dir())
cmd = [
"docker", "run",
"--rm", # delete container once it has completed running

View File

@@ -0,0 +1,123 @@
import traceback
import requests, time
from loguru import logger
from . import Enricher
from ..core import Metadata, Media, ArchivingContext
from ..storages import S3Storage
class WhisperEnricher(Enricher):
"""
Connects with a Whisper API service to get texts out of audio
whisper API repository: TODO
Only works if an S3 compatible storage is used
"""
name = "whisper_enricher"
def __init__(self, config: dict) -> None:
# without this STEP.__init__ is not called
super().__init__(config)
assert type(self.api_key) == str and len(self.api_key) > 0, "please provide a value for the whisper_enricher api_key"
self.timeout = int(self.timeout)
@staticmethod
def configs() -> dict:
return {
"api_endpoint": {"default": "https://whisper.spoettel.dev/api/v1", "help": "WhisperApi api endpoint"},
"api_key": {"default": None, "help": "WhisperApi api key for authentication"},
"timeout": {"default": 90, "help": "How many seconds to wait at most for a successful job completion."},
"action": {"default": "translation", "help": "which Whisper operation to execute", "choices": ["transcript", "translation", "language_detection"]},
}
def enrich(self, to_enrich: Metadata) -> None:
if not self._get_s3_storage():
logger.error("WhisperEnricher: To use the WhisperEnricher you need to use S3Storage so files are accessible publicly to the whisper service being called.")
return
url = to_enrich.get_url()
logger.debug(f"WHISPER[{self.action}]: iterating media items for {url=}.")
job_results = {}
for i, m in enumerate(to_enrich.media):
if m.is_video() or m.is_audio():
m.store()
try:
job_id = self.submit_job(m)
job_results[job_id] = False
logger.debug(f"JOB SUBMITTED: {job_id=} for {m.key=}")
to_enrich.media[i].set("whisper_model", {"job_id": job_id})
except Exception as e:
logger.error(f"Failed to submit whisper job for {m.filename=} with error {e}\n{traceback.format_exc()}")
job_results = self.check_jobs(job_results)
for i, m in enumerate(to_enrich.media):
if m.is_video() or m.is_audio():
job_id = to_enrich.media[i].get("whisper_model")["job_id"]
to_enrich.media[i].set("whisper_model", {
"job_id": job_id,
self.action: job_results[job_id]
})
def submit_job(self, media: Media):
s3 = self._get_s3_storage()
s3_url = s3.get_cdn_url(media)
assert s3_url in media.urls, f"Could not find S3 url ({s3_url}) in list of stored media urls "
payload = {
"url": s3_url,
"type": self.action,
# "language": "string" # may be a config
}
response = requests.post(f'{self.api_endpoint}/jobs', json=payload, headers={'Authorization': f'Bearer {self.api_key}'})
assert response.status_code == 201, f"calling the whisper api {self.api_endpoint} returned a non-success code: {response.status_code}"
logger.debug(response.json())
return response.json()['id']
def check_jobs(self, job_results: dict):
start_time = time.time()
all_completed = False
while not all_completed and (time.time() - start_time) <= self.timeout:
all_completed = True
for job_id in job_results:
if job_results[job_id]: continue
all_completed = False # at least one not ready
try: job_results[job_id] = self.check_job(job_id)
except Exception as e:
logger.error(f"Failed to check {job_id=} with error {e}\n{traceback.format_exc()}")
if not all_completed: time.sleep(3)
return job_results
def check_job(self, job_id):
r = requests.get(f'{self.api_endpoint}/jobs/{job_id}', headers={'Authorization': f'Bearer {self.api_key}'})
assert r.status_code == 200, f"Job status did not respond with 200, instead with: {r.status_code}"
j = r.json()
logger.debug(f"Checked job {job_id=} with status='{j['status']}'")
if j['status'] == "processing": return False
elif j['status'] == "error": return f"Error: {j['meta']['error']}"
elif j['status'] == "success":
r_res = requests.get(f'{self.api_endpoint}/jobs/{job_id}/artifacts', headers={'Authorization': f'Bearer {self.api_key}'})
assert r_res.status_code == 200, f"Job artifacts did not respond with 200, instead with: {r_res.status_code}"
logger.success(r_res.json())
result = []
for artifact in r_res.json():
subtitle = []
full_text = []
for i, d in enumerate(artifact.get("data")):
subtitle.append(f"{i+1}\n{d.get('start')} --> {d.get('end')}\n{d.get('text').strip()}")
full_text.append(d.get('text').strip())
if not len(subtitle): continue
result.append({
"subtitle": "\n".join(subtitle),
"full_text": "\n".join(full_text),
})
return result
return False
def _get_s3_storage(self) -> S3Storage:
try:
return next(s for s in ArchivingContext.get("storages") if s.__class__ == S3Storage)
except:
logger.warning("No S3Storage instance found in storages")
return

View File

@@ -1,7 +1,7 @@
from loguru import logger
from . import Feeder
from ..core import Metadata
from ..core import Metadata, ArchivingContext
class CLIFeeder(Feeder):
@@ -26,5 +26,7 @@ class CLIFeeder(Feeder):
def __iter__(self) -> Metadata:
for url in self.urls:
logger.debug(f"Processing {url}")
yield Metadata().set_url(url).set("folder", "cli", True)
yield Metadata().set_url(url)
ArchivingContext.set("folder", "cli")
logger.success(f"Processed {len(self.urls)} URL(s)")

View File

@@ -5,9 +5,10 @@ from slugify import slugify
# from . import Enricher
from . import Feeder
from ..core import Metadata
from ..core import Metadata, ArchivingContext
from ..utils import Gsheets, GWorksheet
class GsheetsFeeder(Gsheets, Feeder):
name = "gsheet_feeder"
@@ -31,7 +32,7 @@ class GsheetsFeeder(Gsheets, Feeder):
"help": "(CSV) explicitly block some worksheets from being processed",
"cli_set": lambda cli_val, cur_val: set(cli_val.split(","))
},
"use_sheet_names_in_stored_paths":{
"use_sheet_names_in_stored_paths": {
"default": True,
"help": "if True the stored files path will include 'workbook_name/worksheet_name/...'",
}
@@ -61,11 +62,12 @@ class GsheetsFeeder(Gsheets, Feeder):
if status not in ['', None]: continue
# All checks done - archival process starts here
m = Metadata().set_url(url).set("gsheet", {"row": row, "worksheet": gw}, True)
m = Metadata().set_url(url)
ArchivingContext.set("gsheet", {"row": row, "worksheet": gw}, keep_on_reset=True)
if self.use_sheet_names_in_stored_paths:
m.set("folder", os.path.join(slugify(self.sheet), slugify(wks.title)), True)
ArchivingContext.set("folder", os.path.join(slugify(self.sheet), slugify(wks.title)), True)
yield m
logger.success(f'Finished worksheet {wks.title}')
def should_process_sheet(self, sheet_name: str) -> bool:

View File

@@ -6,7 +6,7 @@ from urllib.parse import quote
from loguru import logger
from ..version import __version__
from ..core import Metadata, Media
from ..core import Metadata, Media, ArchivingContext
from . import Formatter
@@ -40,10 +40,10 @@ class HtmlFormatter(Formatter):
url=url,
title=item.get_title(),
media=item.media,
metadata=item.get_clean_metadata(),
metadata=item.metadata,
version=__version__
)
html_path = os.path.join(item.get_tmp_dir(), f"formatted{str(uuid.uuid4())}.html")
html_path = os.path.join(ArchivingContext.get_tmp_dir(), f"formatted{str(uuid.uuid4())}.html")
with open(html_path, mode="w", encoding="utf-8") as outf:
outf.write(content)
return Media(filename=html_path)

View File

@@ -29,7 +29,7 @@
margin: auto;
border: 1px solid;
border-collapse: collapse;
vertical-align:top;
vertical-align: top;
}
table.metadata td:first-child {
@@ -185,7 +185,11 @@
el.addEventListener("copy", (e) => {
e.preventDefault();
if (e.clipboardData) {
e.clipboardData.setData("text/plain", el.textContent);
if (el.hasAttribute("copy-value")) {
e.clipboardData.setData("text/plain", el.getAttribute("copy-value"));
} else {
e.clipboardData.setData("text/plain", el.textContent);
}
console.log(e.clipboardData.getData("text"))
showNotification("copied!")
}

View File

@@ -46,14 +46,16 @@ No preview available for {{ m.key }}.
{% endif %}
{% if links %}
<a href="{{ url }}">open</a> or
<a href="{{ url }}" download="">download</a>
<a href="{{ url }}" download="">download</a> or
{{ copy_urlize(url, "copy") }}
<br>
{% endif %}
{% endfor %}
{%- endmacro -%}
{% macro copy_urlize(val) -%}
{% macro copy_urlize(val, href_text) -%}
{% if val is mapping %}
<ul>
@@ -65,7 +67,11 @@ No preview available for {{ m.key }}.
</ul>
{% else %}
{% if href_text | length == 0 %}
<span class="copy">{{ val | string | urlize }}</span>
{% else %}
<span class="copy" copy-value="{{val}}">{{ href_text | string | urlize }}</span>
{% endif %}
{% endif %}
{%- endmacro -%}

View File

@@ -1,10 +1,10 @@
from __future__ import annotations
from abc import abstractmethod
from dataclasses import dataclass
import hashlib
from typing import IO, Any
from typing import IO
from ..core import Media, Metadata, Step
from ..core import Media, Step, ArchivingContext
from ..enrichers import HashEnricher
from loguru import logger
import os, uuid
from slugify import slugify
@@ -41,8 +41,11 @@ class Storage(Step):
# only for typing...
return Step.init(name, config, Storage)
def store(self, media: Media, item: Metadata) -> None:
self.set_key(media, item)
def store(self, media: Media, url: str) -> None:
if media.is_stored():
logger.debug(f"{self.key} already stored, skipping")
return
self.set_key(media, url)
self.upload(media)
media.add_url(self.get_cdn_url(media))
@@ -57,25 +60,25 @@ class Storage(Step):
with open(media.filename, 'rb') as f:
return self.uploadf(f, media, **kwargs)
def set_key(self, media: Media, item: Metadata) -> None:
def set_key(self, media: Media, url) -> None:
"""takes the media and optionally item info and generates a key"""
if media.key is not None and len(media.key) > 0: return
folder = item.get("folder", "")
folder = ArchivingContext.get("folder", "")
filename, ext = os.path.splitext(media.filename)
# path_generator logic
if self.path_generator == "flat":
if self.path_generator == "flat":
path = ""
filename = slugify(filename) # in case it comes with os.sep
elif self.path_generator == "url": path = slugify(item.get_url())
filename = slugify(filename) # in case it comes with os.sep
elif self.path_generator == "url": path = slugify(url)
elif self.path_generator == "random":
path = item.get("random_path", str(uuid.uuid4())[:16], True)
path = ArchivingContext.get("random_path", str(uuid.uuid4())[:16], True)
# filename_generator logic
if self.filename_generator == "random": filename = str(uuid.uuid4())[:16]
elif self.filename_generator == "static":
with open(media.filename, "rb") as f:
bytes = f.read() # read entire file as bytes
filename = hashlib.sha256(bytes).hexdigest()[:24]
elif self.filename_generator == "static":
he = HashEnricher({"hash_enricher": {"algorithm": "SHA-256", "chunksize": 1.6e7}})
hd = he.calculate_hash(media.filename)
filename = hd[:24]
media.key = os.path.join(folder, path, f"{filename}{ext}")
media.key = os.path.join(folder, path, f"{filename}{ext}")

View File

@@ -1,9 +1,9 @@
_MAJOR = "0"
_MINOR = "4"
_MINOR = "5"
# On main and in a nightly release the patch should be one ahead of the last
# released build.
_PATCH = "2"
_PATCH = "6"
# This is mainly for nightly builds which have the suffix ".dev$DATE". See
# https://semver.org/#is-v123-a-semantic-version for the semantics.
_SUFFIX = ""