From 906ed0f6e0680736079a3a27ebd2be521ff9d37e Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Thu, 23 Mar 2023 11:17:38 +0000 Subject: [PATCH] creating global context and refactoring tmp_dir logic --- src/auto_archiver/archivers/archiver.py | 6 +-- .../archivers/instagram_tbot_archiver.py | 4 +- .../archivers/telethon_archiver.py | 4 +- .../archivers/tiktok_archiver.py | 4 +- src/auto_archiver/archivers/vk_archiver.py | 4 +- .../archivers/youtubedl_archiver.py | 4 +- src/auto_archiver/core/__init__.py | 1 + src/auto_archiver/core/context.py | 38 +++++++++++++++++++ src/auto_archiver/core/media.py | 15 +++++++- src/auto_archiver/core/metadata.py | 19 +++++----- src/auto_archiver/core/orchestrator.py | 5 ++- .../enrichers/screenshot_enricher.py | 4 +- .../enrichers/thumbnail_enricher.py | 4 +- src/auto_archiver/enrichers/wacz_enricher.py | 4 +- .../formatters/html_formatter.py | 4 +- src/auto_archiver/version.py | 2 +- 16 files changed, 88 insertions(+), 34 deletions(-) create mode 100644 src/auto_archiver/core/context.py diff --git a/src/auto_archiver/archivers/archiver.py b/src/auto_archiver/archivers/archiver.py index c986536..419ee7a 100644 --- a/src/auto_archiver/archivers/archiver.py +++ b/src/auto_archiver/archivers/archiver.py @@ -3,8 +3,8 @@ from abc import abstractmethod from dataclasses import dataclass import os import mimetypes, requests -from ..core import Metadata -from ..core import Step + +from ..core import Metadata, Step, ArchivingContext @dataclass @@ -51,7 +51,7 @@ class Archiver(Step): if len(to_filename) > 64: to_filename = to_filename[-64:] if item: - to_filename = os.path.join(item.get_tmp_dir(), to_filename) + to_filename = os.path.join(ArchivingContext.get_tmp_dir(), to_filename) headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36' } diff --git a/src/auto_archiver/archivers/instagram_tbot_archiver.py 
b/src/auto_archiver/archivers/instagram_tbot_archiver.py index 3b23b58..55141ac 100644 --- a/src/auto_archiver/archivers/instagram_tbot_archiver.py +++ b/src/auto_archiver/archivers/instagram_tbot_archiver.py @@ -4,7 +4,7 @@ from loguru import logger import time, os from sqlite3 import OperationalError from . import Archiver -from ..core import Metadata, Media +from ..core import Metadata, Media, ArchivingContext class InstagramTbotArchiver(Archiver): @@ -44,7 +44,7 @@ class InstagramTbotArchiver(Archiver): if not "instagram.com" in url: return False result = Metadata() - tmp_dir = item.get_tmp_dir() + tmp_dir = ArchivingContext.get_tmp_dir() with self.client.start(): chat = self.client.get_entity("instagram_load_bot") since_id = self.client.send_message(entity=chat, message=url).id diff --git a/src/auto_archiver/archivers/telethon_archiver.py b/src/auto_archiver/archivers/telethon_archiver.py index 5cd6148..67b5b59 100644 --- a/src/auto_archiver/archivers/telethon_archiver.py +++ b/src/auto_archiver/archivers/telethon_archiver.py @@ -8,7 +8,7 @@ from tqdm import tqdm import re, time, json, os from . import Archiver -from ..core import Metadata, Media +from ..core import Metadata, Media, ArchivingContext class TelethonArchiver(Archiver): @@ -128,7 +128,7 @@ class TelethonArchiver(Archiver): media_posts = self._get_media_posts_in_group(chat, post) logger.debug(f'got {len(media_posts)=} for {url=}') - tmp_dir = item.get_tmp_dir() + tmp_dir = ArchivingContext.get_tmp_dir() group_id = post.grouped_id if post.grouped_id is not None else post.id title = post.message diff --git a/src/auto_archiver/archivers/tiktok_archiver.py b/src/auto_archiver/archivers/tiktok_archiver.py index 601ef51..e1fc88f 100644 --- a/src/auto_archiver/archivers/tiktok_archiver.py +++ b/src/auto_archiver/archivers/tiktok_archiver.py @@ -3,7 +3,7 @@ import tiktok_downloader from loguru import logger from . 
import Archiver -from ..core import Metadata, Media +from ..core import Metadata, Media, ArchivingContext class TiktokArchiver(Archiver): @@ -41,7 +41,7 @@ class TiktokArchiver(Archiver): logger.warning(f'Other Tiktok error {error}') try: - filename = os.path.join(item.get_tmp_dir(), f'{str(uuid.uuid4())[0:8]}.mp4') + filename = os.path.join(ArchivingContext.get_tmp_dir(), f'{str(uuid.uuid4())[0:8]}.mp4') tiktok_media = tiktok_downloader.snaptik(url).get_media() if len(tiktok_media) <= 0: diff --git a/src/auto_archiver/archivers/vk_archiver.py b/src/auto_archiver/archivers/vk_archiver.py index bbc3456..8defb96 100644 --- a/src/auto_archiver/archivers/vk_archiver.py +++ b/src/auto_archiver/archivers/vk_archiver.py @@ -3,7 +3,7 @@ from vk_url_scraper import VkScraper from ..utils.misc import dump_payload from . import Archiver -from ..core import Metadata, Media +from ..core import Metadata, Media, ArchivingContext class VkArchiver(Archiver): @@ -50,7 +50,7 @@ class VkArchiver(Archiver): result.set_content(dump_payload(vk_scrapes)) - filenames = self.vks.download_media(vk_scrapes, item.get_tmp_dir()) + filenames = self.vks.download_media(vk_scrapes, ArchivingContext.get_tmp_dir()) for filename in filenames: result.add_media(Media(filename)) diff --git a/src/auto_archiver/archivers/youtubedl_archiver.py b/src/auto_archiver/archivers/youtubedl_archiver.py index d136c61..92637c0 100644 --- a/src/auto_archiver/archivers/youtubedl_archiver.py +++ b/src/auto_archiver/archivers/youtubedl_archiver.py @@ -2,7 +2,7 @@ import datetime, os, yt_dlp from loguru import logger from . 
import Archiver -from ..core import Metadata, Media +from ..core import Metadata, Media, ArchivingContext class YoutubeDLArchiver(Archiver): @@ -25,7 +25,7 @@ class YoutubeDLArchiver(Archiver): logger.debug('Using Facebook cookie') yt_dlp.utils.std_headers['cookie'] = self.facebook_cookie - ydl = yt_dlp.YoutubeDL({'outtmpl': os.path.join(item.get_tmp_dir(), f'%(id)s.%(ext)s'), 'quiet': False}) + ydl = yt_dlp.YoutubeDL({'outtmpl': os.path.join(ArchivingContext.get_tmp_dir(), f'%(id)s.%(ext)s'), 'quiet': False}) try: # don'd download since it can be a live stream diff --git a/src/auto_archiver/core/__init__.py b/src/auto_archiver/core/__init__.py index 04f381e..d9a04bd 100644 --- a/src/auto_archiver/core/__init__.py +++ b/src/auto_archiver/core/__init__.py @@ -1,6 +1,7 @@ from .media import Media from .metadata import Metadata from .step import Step +from .context import ArchivingContext # cannot import ArchivingOrchestrator/Config to avoid circular dep # from .orchestrator import ArchivingOrchestrator diff --git a/src/auto_archiver/core/context.py b/src/auto_archiver/core/context.py new file mode 100644 index 0000000..c1709e7 --- /dev/null +++ b/src/auto_archiver/core/context.py @@ -0,0 +1,38 @@ + +class ArchivingContext: + """ + Singleton context class. 
+ ArchivingContext.get_instance() to retrieve it if needed + otherwise just + ArchivingContext.set(key, value) + and + ArchivingContext.get(key, default) + """ + _instance = None + + def __init__(self): + self.configs = {} + + @staticmethod + def get_instance(): + if ArchivingContext._instance is None: + ArchivingContext._instance = ArchivingContext() + return ArchivingContext._instance + + @staticmethod + def set(key, value): + ArchivingContext.get_instance().configs[key] = value + + @staticmethod + def get(key: str, default=None): + return ArchivingContext.get_instance().configs.get(key, default) + + # ---- custom getters/setters for widely used context values + + @staticmethod + def set_tmp_dir(tmp_dir: str): + ArchivingContext.get_instance().configs["tmp_dir"] = tmp_dir + + @staticmethod + def get_tmp_dir() -> str: + return ArchivingContext.get_instance().configs.get("tmp_dir") diff --git a/src/auto_archiver/core/media.py b/src/auto_archiver/core/media.py index 40cab1f..53f2a0b 100644 --- a/src/auto_archiver/core/media.py +++ b/src/auto_archiver/core/media.py @@ -6,8 +6,9 @@ from dataclasses import dataclass, field from dataclasses_json import dataclass_json import mimetypes -# annotation order matters -@dataclass_json + + +@dataclass_json # annotation order matters @dataclass class Media: filename: str @@ -40,3 +41,13 @@ class Media: def is_video(self) -> bool: return self.mimetype.startswith("video") + + def is_audio(self) -> bool: + return self.mimetype.startswith("audio") + + def store(self): + """ + either stores this media entry and all its media descendants + or returns if that process is already completed + """ + pass diff --git a/src/auto_archiver/core/metadata.py b/src/auto_archiver/core/metadata.py index b38af0c..2ae583d 100644 --- a/src/auto_archiver/core/metadata.py +++ b/src/auto_archiver/core/metadata.py @@ -3,13 +3,12 @@ from __future__ import annotations from ast import List, Set from typing import Any, Union, Dict from dataclasses import 
dataclass, field -from dataclasses_json import dataclass_json +from dataclasses_json import dataclass_json, config import datetime from urllib.parse import urlparse from dateutil.parser import parse as parse_dt from .media import Media - # annotation order matters @dataclass_json @dataclass @@ -17,10 +16,14 @@ class Metadata: status: str = "no archiver" _processed_at: datetime = field(default_factory=datetime.datetime.utcnow) metadata: Dict[str, Any] = field(default_factory=dict) - tmp_keys: Set[str] = field(default_factory=set, repr=False, metadata={"exclude": True}) # keys that are not to be saved in DBs media: List[Media] = field(default_factory=list) rearchivable: bool = True # defaults to true, archivers can overwrite + # properties below are excluded from JSON representation + tmp_keys: Set[str] = field(default_factory=set, repr=False, metadata=config(exclude=True)) + # tmp_metadata: Dict[str, Any] = field(default_factory=dict, repr=False, metadata=config(exclude=True)) # contains internal properties not to be leaked when .to_json/repr/str is called + + def merge(self: Metadata, right: Metadata, overwrite_left=True) -> Metadata: """ merges two Metadata instances, will overwrite according to overwrite_left flag @@ -93,12 +96,6 @@ class Metadata: def get_title(self) -> str: return self.get("title") - def set_tmp_dir(self, tmp_dir: str) -> Metadata: - return self.set("tmp_dir", tmp_dir, True) - - def get_tmp_dir(self) -> str: - return self.get("tmp_dir") - def set_timestamp(self, timestamp: datetime.datetime) -> Metadata: if type(timestamp) == str: timestamp = parse_dt(timestamp) @@ -144,3 +141,7 @@ class Metadata: {k: v for k, v in self.metadata.items() if k not in self.tmp_keys}, **{"processed_at": self._processed_at} ) + + def __str__(self) -> str: + return self.__repr__() + \ No newline at end of file diff --git a/src/auto_archiver/core/orchestrator.py b/src/auto_archiver/core/orchestrator.py index db3c893..03339fc 100644 --- 
a/src/auto_archiver/core/orchestrator.py +++ b/src/auto_archiver/core/orchestrator.py @@ -2,6 +2,8 @@ from __future__ import annotations from ast import List from typing import Union +from .context import ArchivingContext + from ..archivers import Archiver from ..feeders import Feeder from ..formatters import Formatter @@ -23,6 +25,7 @@ class ArchivingOrchestrator: self.archivers: List[Archiver] = config.archivers self.databases: List[Database] = config.databases self.storages: List[Storage] = config.storages + ArchivingContext.set("storages", self.storages) for a in self.archivers: a.setup() @@ -33,7 +36,7 @@ class ArchivingOrchestrator: def feed_item(self, item: Metadata) -> Metadata: try: with tempfile.TemporaryDirectory(dir="./") as tmp_dir: - item.set_tmp_dir(tmp_dir) + ArchivingContext.set_tmp_dir(tmp_dir) return self.archive(item) except KeyboardInterrupt: # catches keyboard interruptions to do a clean exit diff --git a/src/auto_archiver/enrichers/screenshot_enricher.py b/src/auto_archiver/enrichers/screenshot_enricher.py index 8b9e3c5..7be01f0 100644 --- a/src/auto_archiver/enrichers/screenshot_enricher.py +++ b/src/auto_archiver/enrichers/screenshot_enricher.py @@ -4,7 +4,7 @@ from selenium.common.exceptions import TimeoutException from . 
import Enricher from ..utils import Webdriver, UrlUtil -from ..core import Media, Metadata +from ..core import Media, Metadata, ArchivingContext class ScreenshotEnricher(Enricher): name = "screenshot_enricher" @@ -29,7 +29,7 @@ class ScreenshotEnricher(Enricher): try: driver.get(url) time.sleep(int(self.sleep_before_screenshot)) - screenshot_file = os.path.join(to_enrich.get_tmp_dir(), f"screenshot_{str(uuid.uuid4())[0:8]}.png") + screenshot_file = os.path.join(ArchivingContext.get_tmp_dir(), f"screenshot_{str(uuid.uuid4())[0:8]}.png") driver.save_screenshot(screenshot_file) to_enrich.add_media(Media(filename=screenshot_file), id="screenshot") except TimeoutException: diff --git a/src/auto_archiver/enrichers/thumbnail_enricher.py b/src/auto_archiver/enrichers/thumbnail_enricher.py index 2816db5..480f186 100644 --- a/src/auto_archiver/enrichers/thumbnail_enricher.py +++ b/src/auto_archiver/enrichers/thumbnail_enricher.py @@ -2,7 +2,7 @@ import ffmpeg, os, uuid from loguru import logger from . import Enricher -from ..core import Media, Metadata +from ..core import Media, Metadata, ArchivingContext class ThumbnailEnricher(Enricher): @@ -23,7 +23,7 @@ class ThumbnailEnricher(Enricher): logger.debug(f"generating thumbnails") for i, m in enumerate(to_enrich.media[::]): if m.is_video(): - folder = os.path.join(to_enrich.get_tmp_dir(), str(uuid.uuid4())) + folder = os.path.join(ArchivingContext.get_tmp_dir(), str(uuid.uuid4())) os.makedirs(folder, exist_ok=True) logger.debug(f"generating thumbnails for {m.filename}") fps, duration = 0.5, m.get("duration") diff --git a/src/auto_archiver/enrichers/wacz_enricher.py b/src/auto_archiver/enrichers/wacz_enricher.py index 9e141fb..f594efe 100644 --- a/src/auto_archiver/enrichers/wacz_enricher.py +++ b/src/auto_archiver/enrichers/wacz_enricher.py @@ -1,7 +1,7 @@ import os, shutil, subprocess, uuid from loguru import logger -from ..core import Media, Metadata +from ..core import Media, Metadata, ArchivingContext from . 
import Enricher from ..utils import UrlUtil @@ -34,7 +34,7 @@ class WaczEnricher(Enricher): logger.debug(f"generating WACZ for {url=}") collection = str(uuid.uuid4())[0:8] - browsertrix_home = os.path.abspath(to_enrich.get_tmp_dir()) + browsertrix_home = os.path.abspath(ArchivingContext.get_tmp_dir()) cmd = [ "docker", "run", "--rm", # delete container once it has completed running diff --git a/src/auto_archiver/formatters/html_formatter.py b/src/auto_archiver/formatters/html_formatter.py index abd8cf4..80722d3 100644 --- a/src/auto_archiver/formatters/html_formatter.py +++ b/src/auto_archiver/formatters/html_formatter.py @@ -6,7 +6,7 @@ from urllib.parse import quote from loguru import logger from ..version import __version__ -from ..core import Metadata, Media +from ..core import Metadata, Media, ArchivingContext from . import Formatter @@ -43,7 +43,7 @@ class HtmlFormatter(Formatter): metadata=item.get_clean_metadata(), version=__version__ ) - html_path = os.path.join(item.get_tmp_dir(), f"formatted{str(uuid.uuid4())}.html") + html_path = os.path.join(ArchivingContext.get_tmp_dir(), f"formatted{str(uuid.uuid4())}.html") with open(html_path, mode="w", encoding="utf-8") as outf: outf.write(content) return Media(filename=html_path) diff --git a/src/auto_archiver/version.py b/src/auto_archiver/version.py index 4685bb5..3a712c1 100644 --- a/src/auto_archiver/version.py +++ b/src/auto_archiver/version.py @@ -1,6 +1,6 @@ _MAJOR = "0" -_MINOR = "4" +_MINOR = "5" # On main and in a nightly release the patch should be one ahead of the last # released build. _PATCH = "5"