From b763fc418838b30433e05877d396c02c7d50a0fb Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Sat, 21 Jan 2023 19:44:12 +0000 Subject: [PATCH] final naming cleanup + new feeders/dbs --- .gitignore | 3 +- src/auto_archiver/__init__.py | 2 +- src/auto_archiver/__main__.py | 4 +- src/auto_archiver/archivers/__init__.py | 18 +- src/auto_archiver/archivers/archiver.py | 6 +- src/auto_archiver/archivers/base_archiver.py | 384 ------------------ ...am_archiverv2.py => instagram_archiver.py} | 4 +- ...ram_archiverv2.py => telegram_archiver.py} | 4 +- ...hon_archiverv2.py => telethon_archiver.py} | 4 +- ...iktok_archiverv2.py => tiktok_archiver.py} | 4 +- ..._archiverv2.py => twitter_api_archiver.py} | 6 +- ...tter_archiverv2.py => twitter_archiver.py} | 4 +- .../{vk_archiverv2.py => vk_archiver.py} | 4 +- ...dl_archiverv2.py => youtubedl_archiver.py} | 4 +- src/auto_archiver/auto_archive.py | 176 -------- src/auto_archiver/auto_auto_archive.py | 55 +-- src/auto_archiver/core/__init__.py | 2 +- .../core/{v2config.py => config.py} | 16 +- src/auto_archiver/core/metadata.py | 4 +- src/auto_archiver/core/orchestrator.py | 16 +- src/auto_archiver/core/step.py | 2 +- src/auto_archiver/databases/__init__.py | 3 +- src/auto_archiver/databases/csv_db.py | 33 ++ src/auto_archiver/databases/database.py | 1 - .../enrichers/wayback_enricher.py | 4 +- src/auto_archiver/feeders/__init__.py | 3 +- src/auto_archiver/feeders/cli_feeder.py | 33 ++ src/auto_archiver/storages/__init__.py | 12 +- src/auto_archiver/storages/base_storage.py | 33 -- src/auto_archiver/storages/gd_storage.py | 311 +++++++------- src/auto_archiver/storages/local.py | 4 +- src/auto_archiver/storages/s3.py | 4 +- src/auto_archiver/storages/s3_storage.py | 80 ---- src/auto_archiver/storages/storage.py | 6 +- 34 files changed, 322 insertions(+), 927 deletions(-) delete mode 100644 src/auto_archiver/archivers/base_archiver.py rename src/auto_archiver/archivers/{instagram_archiverv2.py => instagram_archiver.py} (99%) rename src/auto_archiver/archivers/{telegram_archiverv2.py => telegram_archiver.py} (97%) rename src/auto_archiver/archivers/{telethon_archiverv2.py => telethon_archiver.py} (99%) rename src/auto_archiver/archivers/{tiktok_archiverv2.py => tiktok_archiver.py} (96%) rename src/auto_archiver/archivers/{twitter_api_archiverv2.py => twitter_api_archiver.py} (97%) rename src/auto_archiver/archivers/{twitter_archiverv2.py => twitter_archiver.py} (98%) rename src/auto_archiver/archivers/{vk_archiverv2.py => vk_archiver.py} (97%) rename src/auto_archiver/archivers/{youtubedl_archiverv2.py => youtubedl_archiver.py} (97%) delete mode 100644 src/auto_archiver/auto_archive.py rename src/auto_archiver/core/{v2config.py => config.py} (93%) create mode 100644 src/auto_archiver/databases/csv_db.py create mode 100644 src/auto_archiver/feeders/cli_feeder.py delete mode 100644 src/auto_archiver/storages/base_storage.py delete mode 100644 src/auto_archiver/storages/s3_storage.py diff --git a/.gitignore b/.gitignore index 2c55563..b3a4b36 100644 --- a/.gitignore +++ b/.gitignore @@ -26,4 +26,5 @@ instaloader/* instaloader.session orchestration.yaml auto_archiver.egg-info* -logs* \ No newline at end of file +logs* +*.csv \ No newline at end of file diff --git a/src/auto_archiver/__init__.py b/src/auto_archiver/__init__.py index c02d330..adb8551 100644 --- a/src/auto_archiver/__init__.py +++ b/src/auto_archiver/__init__.py @@ -2,6 +2,6 @@ from . import archivers, databases, enrichers, feeders, formatters, storages, ut # need to manually specify due to cyclical deps from .core.orchestrator import ArchivingOrchestrator -from .core.v2config import ConfigV2 +from .core.config import Config # making accessible directly from .core.metadata import Metadata \ No newline at end of file diff --git a/src/auto_archiver/__main__.py b/src/auto_archiver/__main__.py index 812e4e2..e8a81c4 100644 --- a/src/auto_archiver/__main__.py +++ b/src/auto_archiver/__main__.py @@ -1,8 +1,8 @@ -from . import ConfigV2 +from . import Config from . import ArchivingOrchestrator def main(): - config = ConfigV2() + config = Config() config.parse() orchestrator = ArchivingOrchestrator(config) orchestrator.feed() diff --git a/src/auto_archiver/archivers/__init__.py b/src/auto_archiver/archivers/__init__.py index 595ea8c..b5c4faa 100644 --- a/src/auto_archiver/archivers/__init__.py +++ b/src/auto_archiver/archivers/__init__.py @@ -10,12 +10,12 @@ # from .twitter_api_archiver import TwitterApiArchiver # from .instagram_archiver import InstagramArchiver -from .archiver import Archiverv2 -from .telethon_archiverv2 import TelethonArchiver -from .twitter_archiverv2 import TwitterArchiver -from .twitter_api_archiverv2 import TwitterApiArchiver -from .instagram_archiverv2 import InstagramArchiver -from .tiktok_archiverv2 import TiktokArchiver -from .telegram_archiverv2 import TelegramArchiver -from .vk_archiverv2 import VkArchiver -from .youtubedl_archiverv2 import YoutubeDLArchiver \ No newline at end of file +from .archiver import Archiver +from .telethon_archiver import TelethonArchiver +from .twitter_archiver import TwitterArchiver +from .twitter_api_archiver import TwitterApiArchiver +from .instagram_archiver import InstagramArchiver +from .tiktok_archiver import TiktokArchiver +from .telegram_archiver import TelegramArchiver +from .vk_archiver import VkArchiver +from .youtubedl_archiver import YoutubeDLArchiver \ No newline at end of file diff --git a/src/auto_archiver/archivers/archiver.py b/src/auto_archiver/archivers/archiver.py index d09044e..49efd66 100644 --- a/src/auto_archiver/archivers/archiver.py +++ b/src/auto_archiver/archivers/archiver.py @@ -8,16 +8,16 @@ from ..core import Step @dataclass -class Archiverv2(Step): +class Archiver(Step): name = "archiver" def __init__(self, config: dict) -> None: # without this STEP.__init__ is not called super().__init__(config) - def init(name: str, config: dict) -> Archiverv2: + def init(name: str, config: dict) -> Archiver: # only for typing... - return Step.init(name, config, Archiverv2) + return Step.init(name, config, Archiver) def setup(self) -> None: # used when archivers need to login or do other one-time setup diff --git a/src/auto_archiver/archivers/base_archiver.py b/src/auto_archiver/archivers/base_archiver.py deleted file mode 100644 index 103b9ff..0000000 --- a/src/auto_archiver/archivers/base_archiver.py +++ /dev/null @@ -1,384 +0,0 @@ -import os, datetime, shutil, hashlib, time, requests, re, mimetypes, subprocess -from dataclasses import dataclass, field -from abc import ABC, abstractmethod -from urllib.parse import urlparse -from random import randrange -from collections import defaultdict - -import ffmpeg -from loguru import logger -from selenium.common.exceptions import TimeoutException -from selenium.webdriver.common.by import By -from slugify import slugify - -from ..configs import Config -from storages import Storage -from utils import mkdir_if_not_exists - - -@dataclass -class ArchiveResult: - status: str - cdn_url: str = None - thumbnail: str = None - thumbnail_index: str = None - duration: float = None - title: str = None - timestamp: datetime.datetime = None - screenshot: str = None - wacz: str = None - hash: str = None - media: list = field(default_factory=list) - -class Archiver(ABC): - name = "default" - retry_regex = r"retrying at (\d+)$" - - def __init__(self, storage: Storage, config: Config): - self.storage = storage - self.driver = config.webdriver - self.hash_algorithm = config.hash_algorithm - self.browsertrix = config.browsertrix_config - self.is_docker = config.is_docker - self.media = [] - - def __str__(self): - return self.__class__.__name__ - - def __repr__(self): - return self.__str__() - - @abstractmethod - def download(self, url, check_if_exists=False): pass - - def generateArchiveResult(self, **kwargs): - # remove duplicates - if "cdn_url" in kwargs: - self.add_to_media(kwargs["cdn_url"], None, kwargs.get("hash")) - kwargs["media"] = [dict(t) for t in {tuple(d.items()) for d in self.media}] - return ArchiveResult(**kwargs) - - def get_netloc(self, url): - return urlparse(url).netloc - - def add_to_media(self, cdn_url: str, key: str = None, hash: str = None): - media_info = {"url": cdn_url, "mime": self._guess_file_type(cdn_url) or "misc"} - if key: media_info["key"] = key - if hash: media_info["hash"] = hash - self.media.append(media_info) - - def generate_media_page_html(self, url, urls_info: dict, object, thumbnail=None): - """ - Generates an index.html page where each @urls_info is displayed - """ - for ui in urls_info: - self.add_to_media(ui["cdn_url"], ui["key"], ui["hash"]) - page = f'''{url} - -

Archived media from {self.name}

-

{url}

{self.name} object data:

{object}" - page += f"" - - page_key = self.get_html_key(url) - page_filename = os.path.join(Storage.TMP_FOLDER, page_key) - - with open(page_filename, "w") as f: - f.write(page) - - page_hash = self.get_hash(page_filename) - - self.storage.upload(page_filename, page_key, extra_args={ - 'ACL': 'public-read', 'ContentType': 'text/html'}) - - page_cdn = self.storage.get_cdn_url(page_key) - return (page_cdn, page_hash, thumbnail) - - def _guess_file_type(self, path: str): - """ - Receives a URL or filename and returns global mimetype like 'image' or 'video' - see https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/MIME_types/Common_types - """ - mime = mimetypes.guess_type(path)[0] - if mime is not None: - return mime.split("/")[0] - return "" - - def download_from_url(self, url, to_filename): - headers = { - 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36' - } - d = requests.get(url, headers=headers) - with open(to_filename, 'wb') as f: - f.write(d.content) - - def generate_media_page(self, urls, url, object): - """ - For a list of media urls, fetch them, upload them - and call self.generate_media_page_html with them - """ - for media_url in urls: - self.add_to_media(media_url) - - thumbnail = None - uploaded_media = [] - for media_url in urls: - key = self._get_key_from_url(media_url, ".jpg") - - filename = os.path.join(Storage.TMP_FOLDER, key) - self.download_from_url(media_url, filename) - self.storage.upload(filename, key) - hash = self.get_hash(filename) - cdn_url = self.storage.get_cdn_url(key) - - if thumbnail is None: - thumbnail = cdn_url - uploaded_media.append({'cdn_url': cdn_url, 'key': key, 'hash': hash}) - - return self.generate_media_page_html(url, uploaded_media, object, thumbnail=thumbnail) - - def get_key(self, filename): - """ - returns a key in the format "[archiverName]_[filename]" includes extension - """ - tail = os.path.split(filename)[1] # returns filename.ext from full path - _id, extension = os.path.splitext(tail) # returns [filename, .ext] - if 'unknown_video' in _id: - _id = _id.replace('unknown_video', 'jpg') - - # long filenames can cause problems, so trim them if necessary - if len(_id) > 128: - _id = _id[-128:] - - return f'{self.name}_{_id}{extension}' - - def get_html_key(self, url): - return self._get_key_from_url(url, ".html") - - def _get_key_from_url(self, url, with_extension: str = None, append_datetime: bool = False): - """ - Receives a URL and returns a slugified version of the URL path - if a string is passed in @with_extension the slug is appended with it if there is no "." in the slug - if @append_date is true, the key adds a timestamp after the URL slug and before the extension - """ - url_path = urlparse(url).path - path, ext = os.path.splitext(url_path) - slug = slugify(path) - if append_datetime: - slug += "-" + slugify(datetime.datetime.utcnow().isoformat()) - if len(ext): - slug += ext - if with_extension is not None: - if "." not in slug: - slug += with_extension - return self.get_key(slug) - - def get_hash(self, filename): - with open(filename, "rb") as f: - bytes = f.read() # read entire file as bytes - logger.debug(f'Hash algorithm is {self.hash_algorithm}') - - if self.hash_algorithm == "SHA-256": hash = hashlib.sha256(bytes) - elif self.hash_algorithm == "SHA3-512": hash = hashlib.sha3_512(bytes) - else: raise Exception(f"Unknown Hash Algorithm of {self.hash_algorithm}") - - return hash.hexdigest() - - def get_screenshot(self, url): - logger.debug(f"getting screenshot for {url=}") - key = self._get_key_from_url(url, ".png", append_datetime=True) - filename = os.path.join(Storage.TMP_FOLDER, key) - - # Accept cookies popup dismiss for ytdlp video - if 'facebook.com' in url: - try: - logger.debug(f'Trying fb click accept cookie popup for {url}') - self.driver.get("http://www.facebook.com") - foo = self.driver.find_element(By.XPATH, "//button[@data-cookiebanner='accept_only_essential_button']") - foo.click() - logger.debug(f'fb click worked') - # linux server needs a sleep otherwise facebook cookie won't have worked and we'll get a popup on next page - time.sleep(2) - except: - logger.warning(f'Failed on fb accept cookies for url {url}') - - try: - self.driver.get(url) - time.sleep(6) - except TimeoutException: - logger.info("TimeoutException loading page for screenshot") - - self.driver.save_screenshot(filename) - self.storage.upload(filename, key, extra_args={'ACL': 'public-read', 'ContentType': 'image/png'}) - - cdn_url = self.storage.get_cdn_url(key) - self.add_to_media(cdn_url, key) - - return cdn_url - - def get_wacz(self, url): - if not self.browsertrix.enabled: - logger.debug(f"Browsertrix WACZ generation is not enabled, skipping.") - return - if self.is_docker: - # TODO: figure out support for browsertrix in docker - # see: https://github.com/bellingcat/auto-archiver/issues/66 - logger.warning(f"Browsertrix WACZ is not yet supported when using DOCKER.") - return - - logger.debug(f"getting wacz for {url}") - key = self._get_key_from_url(url, ".wacz", append_datetime=True) - collection = re.sub('[^0-9a-zA-Z]+', '', key.replace(".wacz", "")) - - browsertrix_home = os.path.join(os.getcwd(), "browsertrix-tmp") - cmd = [ - "docker", "run", - "--rm", # delete container once it has completed running - "-v", f"{browsertrix_home}:/crawls/", - # "-it", # this leads to "the input device is not a TTY" - "webrecorder/browsertrix-crawler", "crawl", - "--url", url, - "--scopeType", "page", - "--generateWACZ", - "--text", - "--collection", collection, - "--behaviors", "autoscroll,autoplay,autofetch,siteSpecific", - "--behaviorTimeout", str(self.browsertrix.timeout_seconds), - "--timeout", str(self.browsertrix.timeout_seconds) - ] - - if not os.path.isdir(browsertrix_home): - os.mkdir(browsertrix_home) - - if self.browsertrix.profile: - shutil.copyfile(self.browsertrix.profile, os.path.join(browsertrix_home, "profile.tar.gz")) - cmd.extend(["--profile", "/crawls/profile.tar.gz"]) - - try: - logger.info(f"Running browsertrix-crawler: {' '.join(cmd)}") - subprocess.run(cmd, check=True) - except Exception as e: - logger.error(f"WACZ generation failed: {e}") - return - - filename = os.path.join(browsertrix_home, "collections", collection, f"{collection}.wacz") - - # do not crash if upload fails - try: - self.storage.upload(filename, key, extra_args={ - 'ACL': 'public-read', 'ContentType': 'application/zip'}) - except FileNotFoundError as e: - logger.warning(f"Unable to locate and upload WACZ {filename=}, {key=}") - - # clean up the local browsertrix files - try: - shutil.rmtree(browsertrix_home) - except PermissionError: - logger.warn(f"Unable to clean up browsertrix-crawler files in {browsertrix_home}") - - cdn_url = self.storage.get_cdn_url(key) - self.add_to_media(cdn_url, key) - return cdn_url - - def get_thumbnails(self, filename, key, duration=None): - thumbnails_folder = os.path.splitext(filename)[0] + os.path.sep - key_folder = key.split('.')[0] + os.path.sep - - mkdir_if_not_exists(thumbnails_folder) - - fps = 0.5 - if duration is not None: - duration = float(duration) - - if duration < 60: - fps = 10.0 / duration - elif duration < 120: - fps = 20.0 / duration - else: - fps = 40.0 / duration - - stream = ffmpeg.input(filename) - stream = ffmpeg.filter(stream, 'fps', fps=fps).filter('scale', 512, -1) - stream.output(thumbnails_folder + 'out%d.jpg').run() - - thumbnails = os.listdir(thumbnails_folder) - cdn_urls = [] - for fname in thumbnails: - if fname[-3:] == 'jpg': - thumbnail_filename = thumbnails_folder + fname - key = os.path.join(key_folder, fname) - - self.storage.upload(thumbnail_filename, key) - cdn_url = self.storage.get_cdn_url(key) - cdn_urls.append(cdn_url) - - if len(cdn_urls) == 0: - return ('', '') - - key_thumb = cdn_urls[int(len(cdn_urls) * 0.1)] - - index_page = f'''{filename} - ''' - - for t in cdn_urls: - index_page += f'' - - index_page += f"" - index_fname = thumbnails_folder + 'index.html' - - with open(index_fname, 'w') as f: - f.write(index_page) - - thumb_index = key_folder + 'index.html' - - self.storage.upload(index_fname, thumb_index, extra_args={ - 'ACL': 'public-read', 'ContentType': 'text/html'}) - shutil.rmtree(thumbnails_folder) - - thumb_index_cdn_url = self.storage.get_cdn_url(thumb_index) - - return (key_thumb, thumb_index_cdn_url) - - def signal_retry_in(self, min_seconds=1800, max_seconds=7200, **kwargs): - """ - sets state to retry in random between (min_seconds, max_seconds) - """ - now = datetime.datetime.now().timestamp() - retry_at = int(now + randrange(min_seconds, max_seconds)) - logger.debug(f"signaling {retry_at=}") - return ArchiveResult(status=f'retrying at {retry_at}', **kwargs) - - def is_retry(status): - return re.search(Archiver.retry_regex, status) is not None - - def should_retry_from_status(status): - """ - checks status against message in signal_retry_in - returns true if enough time has elapsed, false otherwise - """ - match = re.search(Archiver.retry_regex, status) - if match: - retry_at = int(match.group(1)) - now = datetime.datetime.now().timestamp() - should_retry = now >= retry_at - logger.debug(f"{should_retry=} since {now=} and {retry_at=}") - return should_retry - return False - - def remove_retry(status): - """ - transforms the status from retry into something else - """ - new_status = re.sub(Archiver.retry_regex, "failed: too many retries", status, 0) - logger.debug(f"removing retry message at {status=}, got {new_status=}") - return new_status diff --git a/src/auto_archiver/archivers/instagram_archiverv2.py b/src/auto_archiver/archivers/instagram_archiver.py similarity index 99% rename from src/auto_archiver/archivers/instagram_archiverv2.py rename to src/auto_archiver/archivers/instagram_archiver.py index 126622a..c0d1b2c 100644 --- a/src/auto_archiver/archivers/instagram_archiverv2.py +++ b/src/auto_archiver/archivers/instagram_archiver.py @@ -2,11 +2,11 @@ import re, os, shutil, html, traceback import instaloader # https://instaloader.github.io/as-module.html from loguru import logger -from . import Archiverv2 +from . import Archiver from ..core import Metadata from ..core import Media -class InstagramArchiver(Archiverv2): +class InstagramArchiver(Archiver): """ Uses Instaloader to download either a post (inc images, videos, text) or as much as possible from a profile (posts, stories, highlights, ...) """ diff --git a/src/auto_archiver/archivers/telegram_archiverv2.py b/src/auto_archiver/archivers/telegram_archiver.py similarity index 97% rename from src/auto_archiver/archivers/telegram_archiverv2.py rename to src/auto_archiver/archivers/telegram_archiver.py index 2f4bf23..b579dc4 100644 --- a/src/auto_archiver/archivers/telegram_archiverv2.py +++ b/src/auto_archiver/archivers/telegram_archiver.py @@ -4,12 +4,12 @@ import html from bs4 import BeautifulSoup from loguru import logger -from . import Archiverv2 +from . import Archiver from ..core import Metadata from ..core import Media -class TelegramArchiver(Archiverv2): +class TelegramArchiver(Archiver): """ Archiver for telegram that does not require login, but the telethon_archiver is much more advised, will only return if at least one image or one video is found """ diff --git a/src/auto_archiver/archivers/telethon_archiverv2.py b/src/auto_archiver/archivers/telethon_archiver.py similarity index 99% rename from src/auto_archiver/archivers/telethon_archiverv2.py rename to src/auto_archiver/archivers/telethon_archiver.py index 57d27e1..20bf28e 100644 --- a/src/auto_archiver/archivers/telethon_archiverv2.py +++ b/src/auto_archiver/archivers/telethon_archiver.py @@ -8,12 +8,12 @@ from loguru import logger from tqdm import tqdm import re, time, json, os -from . import Archiverv2 +from . import Archiver from ..core import Metadata from ..core import Media -class TelethonArchiver(Archiverv2): +class TelethonArchiver(Archiver): name = "telethon_archiver" link_pattern = re.compile(r"https:\/\/t\.me(\/c){0,1}\/(.+)\/(\d+)") invite_pattern = re.compile(r"t.me(\/joinchat){0,1}\/\+?(.+)") diff --git a/src/auto_archiver/archivers/tiktok_archiverv2.py b/src/auto_archiver/archivers/tiktok_archiver.py similarity index 96% rename from src/auto_archiver/archivers/tiktok_archiverv2.py rename to src/auto_archiver/archivers/tiktok_archiver.py index ea7f670..0c41193 100644 --- a/src/auto_archiver/archivers/tiktok_archiverv2.py +++ b/src/auto_archiver/archivers/tiktok_archiver.py @@ -5,12 +5,12 @@ import uuid import tiktok_downloader from loguru import logger -from . import Archiverv2 +from . import Archiver from ..core import Metadata from ..core import Media -class TiktokArchiver(Archiverv2): +class TiktokArchiver(Archiver): name = "tiktok_archiver" def __init__(self, config: dict) -> None: diff --git a/src/auto_archiver/archivers/twitter_api_archiverv2.py b/src/auto_archiver/archivers/twitter_api_archiver.py similarity index 97% rename from src/auto_archiver/archivers/twitter_api_archiverv2.py rename to src/auto_archiver/archivers/twitter_api_archiver.py index 007193c..821d8d4 100644 --- a/src/auto_archiver/archivers/twitter_api_archiverv2.py +++ b/src/auto_archiver/archivers/twitter_api_archiver.py @@ -7,13 +7,13 @@ from loguru import logger from pytwitter import Api from slugify import slugify -from . import Archiverv2 -from .twitter_archiverv2 import TwitterArchiver +from . import Archiver +from .twitter_archiver import TwitterArchiver from ..core import Metadata from ..core import Media -class TwitterApiArchiver(TwitterArchiver, Archiverv2): +class TwitterApiArchiver(TwitterArchiver, Archiver): name = "twitter_api_archiver" def __init__(self, config: dict) -> None: diff --git a/src/auto_archiver/archivers/twitter_archiverv2.py b/src/auto_archiver/archivers/twitter_archiver.py similarity index 98% rename from src/auto_archiver/archivers/twitter_archiverv2.py rename to src/auto_archiver/archivers/twitter_archiver.py index 58942ab..194025d 100644 --- a/src/auto_archiver/archivers/twitter_archiverv2.py +++ b/src/auto_archiver/archivers/twitter_archiver.py @@ -7,11 +7,11 @@ from loguru import logger from snscrape.modules.twitter import TwitterTweetScraper, Video, Gif, Photo from slugify import slugify -from . import Archiverv2 +from . import Archiver from ..core import Metadata from ..core import Media -class TwitterArchiver(Archiverv2): +class TwitterArchiver(Archiver): """ This Twitter Archiver uses unofficial scraping methods. """ diff --git a/src/auto_archiver/archivers/vk_archiverv2.py b/src/auto_archiver/archivers/vk_archiver.py similarity index 97% rename from src/auto_archiver/archivers/vk_archiverv2.py rename to src/auto_archiver/archivers/vk_archiver.py index 4c2ad98..1d76282 100644 --- a/src/auto_archiver/archivers/vk_archiverv2.py +++ b/src/auto_archiver/archivers/vk_archiver.py @@ -2,12 +2,12 @@ from loguru import logger from vk_url_scraper import VkScraper from ..utils.misc import dump_payload -from . import Archiverv2 +from . import Archiver from ..core import Metadata from ..core import Media -class VkArchiver(Archiverv2): +class VkArchiver(Archiver): """" VK videos are handled by YTDownloader, this archiver gets posts text and images. Currently only works for /wall posts diff --git a/src/auto_archiver/archivers/youtubedl_archiverv2.py b/src/auto_archiver/archivers/youtubedl_archiver.py similarity index 97% rename from src/auto_archiver/archivers/youtubedl_archiverv2.py rename to src/auto_archiver/archivers/youtubedl_archiver.py index 0b5916c..443fb17 100644 --- a/src/auto_archiver/archivers/youtubedl_archiverv2.py +++ b/src/auto_archiver/archivers/youtubedl_archiver.py @@ -4,12 +4,12 @@ import os import yt_dlp from loguru import logger -from . import Archiverv2 +from . import Archiver from ..core import Metadata from ..core import Media -class YoutubeDLArchiver(Archiverv2): +class YoutubeDLArchiver(Archiver): name = "youtubedl_enricher" def __init__(self, config: dict) -> None: diff --git a/src/auto_archiver/auto_archive.py b/src/auto_archiver/auto_archive.py deleted file mode 100644 index a797405..0000000 --- a/src/auto_archiver/auto_archive.py +++ /dev/null @@ -1,176 +0,0 @@ -import os, datetime, traceback, random, tempfile - -from loguru import logger -from slugify import slugify -from urllib.parse import quote - -from archivers import TelethonArchiver, TelegramArchiver, TiktokArchiver, YoutubeDLArchiver, TwitterArchiver, TwitterApiArchiver, VkArchiver, WaybackArchiver, InstagramArchiver, ArchiveResult, Archiver -from utils import GWorksheet, expand_url -from configs import Config -from storages import Storage - -random.seed() - - -def update_sheet(gw, row, url, result: ArchiveResult): - cell_updates = [] - row_values = gw.get_row(row) - - def batch_if_valid(col, val, final_value=None): - final_value = final_value or val - if val and gw.col_exists(col) and gw.get_cell(row_values, col) == '': - cell_updates.append((row, col, final_value)) - - cell_updates.append((row, 'status', result.status)) - - batch_if_valid('archive', result.cdn_url) - batch_if_valid('date', True, datetime.datetime.utcnow().replace(tzinfo=datetime.timezone.utc).isoformat()) - batch_if_valid('thumbnail', result.thumbnail, f'=IMAGE("{result.thumbnail}")') - batch_if_valid('thumbnail_index', result.thumbnail_index) - batch_if_valid('title', result.title) - batch_if_valid('duration', result.duration, str(result.duration)) - batch_if_valid('screenshot', result.screenshot) - batch_if_valid('hash', result.hash) - if result.wacz is not None: - batch_if_valid('wacz', result.wacz) - batch_if_valid('replaywebpage', f'https://replayweb.page/?source={quote(result.wacz)}#view=pages&url={quote(url)}') - - if result.timestamp is not None: - if type(result.timestamp) == int: - timestamp_string = datetime.datetime.fromtimestamp(result.timestamp).replace(tzinfo=datetime.timezone.utc).isoformat() - elif type(result.timestamp) == str: - timestamp_string = result.timestamp - else: - timestamp_string = result.timestamp.isoformat() - - batch_if_valid('timestamp', timestamp_string) - - gw.batch_set_cell(cell_updates) - - -def missing_required_columns(gw: GWorksheet): - missing = False - for required_col in ['url', 'status']: - if not gw.col_exists(required_col): - logger.warning(f'Required column for {required_col}: "{gw.columns[required_col]}" not found, skipping worksheet {gw.wks.title}') - missing = True - return missing - - -def should_process_sheet(c: Config, sheet_name): - if len(c.worksheet_allow) and sheet_name not in c.worksheet_allow: - # ALLOW rules exist AND sheet name not explicitly allowed - return False - if len(c.worksheet_block) and sheet_name in c.worksheet_block: - # BLOCK rules exist AND sheet name is blocked - return False - return True - - -def archive_url(c: Config, url: str, folder: str, debug_string: str, is_retry: bool): - url = expand_url(url) - c.set_folder(folder) - storage = c.get_storage() - - # make a new driver so each spreadsheet row is idempotent - c.recreate_webdriver() - - # order matters, first to succeed excludes remaining - active_archivers = [ - TelethonArchiver(storage, c), - TiktokArchiver(storage, c), - TwitterApiArchiver(storage, c), - InstagramArchiver(storage, c), - YoutubeDLArchiver(storage, c), - TelegramArchiver(storage, c), - TwitterArchiver(storage, c), - VkArchiver(storage, c), - WaybackArchiver(storage, c) - ] - - for archiver in active_archivers: - logger.debug(f'Trying {archiver} on {debug_string}') - - try: - result = archiver.download(url, check_if_exists=c.check_if_exists) - except KeyboardInterrupt as e: raise e # so the higher level catch can catch it - except Exception as e: - result = False - logger.error(f'Got unexpected error in {debug_string} with {archiver.name} for {url=}: {e}\n{traceback.format_exc()}') - - if result: - success = result.status in ['success', 'already archived'] - result.status = f"{archiver.name}: {result.status}" - if success: - logger.success(f'{archiver.name} succeeded on {debug_string}, {url=}') - break - # only 1 retry possible for now - if is_retry and Archiver.is_retry(result.status): - result.status = Archiver.remove_retry(result.status) - logger.warning(f'{archiver.name} did not succeed on {debug_string}, final status: {result.status}') - return result - - -def process_sheet(c: Config): - sh = c.gsheets_client.open(c.sheet) - - # loop through worksheets to check - for ii, wks in enumerate(sh.worksheets()): - if not should_process_sheet(c, wks.title): - logger.info(f'Ignoring worksheet "{wks.title}" due to allow/block configurations') - continue - - logger.info(f'Opening worksheet {ii=}: {wks.title=} {c.header=}') - gw = GWorksheet(wks, header_row=c.header, columns=c.column_names) - - if missing_required_columns(gw): continue - - # archives will default to being in a folder 'doc_name/worksheet_name' - default_folder = os.path.join(slugify(c.sheet), slugify(wks.title)) - c.set_folder(default_folder) - storage = c.get_storage() - - # loop through rows in worksheet - for row in range(1 + c.header, gw.count_rows() + 1): - url = gw.get_cell(row, 'url') - original_status = gw.get_cell(row, 'status') - status = gw.get_cell(row, 'status', fresh=original_status in ['', None] and url != '') - - is_retry = False - if url == '' or status not in ['', None]: - is_retry = Archiver.should_retry_from_status(status) - if not is_retry: continue - - # All checks done - archival process starts here - try: - gw.set_cell(row, 'status', 'Archive in progress') - result = archive_url(c, url, gw.get_cell_or_default(row, 'folder', default_folder, when_empty_use_default=True), f"{row=}", is_retry=is_retry) - if result: - update_sheet(gw, row, url, result) - else: - gw.set_cell(row, 'status', 'failed: no archiver') - except KeyboardInterrupt: - # catches keyboard interruptions to do a clean exit - logger.warning(f"caught interrupt on {row=}, {url=}") - gw.set_cell(row, 'status', '') - c.destroy_webdriver() - exit() - except Exception as e: - logger.error(f'Got unexpected error in row {row} for {url=}: {e}\n{traceback.format_exc()}') - gw.set_cell(row, 'status', 'failed: unexpected error (see logs)') - logger.success(f'Finished worksheet {wks.title}') - - -@logger.catch -def main(): - c = Config() - c.parse() - logger.info(f'Opening document {c.sheet} for header {c.header}') - with tempfile.TemporaryDirectory(dir="./") as tmpdir: - Storage.TMP_FOLDER = tmpdir - process_sheet(c) - c.destroy_webdriver() - - -if __name__ == '__main__': - main() diff --git a/src/auto_archiver/auto_auto_archive.py b/src/auto_archiver/auto_auto_archive.py index 8c794c5..660be4f 100644 --- a/src/auto_archiver/auto_auto_archive.py +++ b/src/auto_archiver/auto_auto_archive.py @@ -1,29 +1,34 @@ -import tempfile -import auto_archive -from loguru import logger -from configs import Config -from storages import Storage + +#TODO: refactor GDriveStorage before merging to main +# is it possible to have something like this with the new pipeline? -def main(): - c = Config() - c.parse() - logger.info(f'Opening document {c.sheet} to look for sheet names to archive') - - gc = c.gsheets_client - sh = gc.open(c.sheet) - - wks = sh.get_worksheet(0) - values = wks.get_all_values() - - with tempfile.TemporaryDirectory(dir="./") as tmpdir: - Storage.TMP_FOLDER = tmpdir - for i in range(11, len(values)): - c.sheet = values[i][0] - logger.info(f"Processing {c.sheet}") - auto_archive.process_sheet(c) - c.destroy_webdriver() +# # import tempfile +# import auto_archive +# from loguru import logger +# from configs import Config +# from storages import Storage -if __name__ == "__main__": - main() +# def main(): +# c = Config() +# c.parse() +# logger.info(f'Opening document {c.sheet} to look for sheet names to archive') + +# gc = c.gsheets_client +# sh = gc.open(c.sheet) + +# wks = sh.get_worksheet(0) +# values = wks.get_all_values() + +# with tempfile.TemporaryDirectory(dir="./") as tmpdir: +# Storage.TMP_FOLDER = tmpdir +# for i in range(11, len(values)): +# c.sheet = values[i][0] +# logger.info(f"Processing {c.sheet}") +# auto_archive.process_sheet(c) +# c.destroy_webdriver() + + +# if __name__ == "__main__": +# main() diff --git a/src/auto_archiver/core/__init__.py b/src/auto_archiver/core/__init__.py index a171478..04f381e 100644 --- a/src/auto_archiver/core/__init__.py +++ b/src/auto_archiver/core/__init__.py @@ -4,4 +4,4 @@ from .step import Step # cannot import ArchivingOrchestrator/Config to avoid circular dep # from .orchestrator import ArchivingOrchestrator -# from .v2config import ConfigV2 \ No newline at end of file +# from .config import Config \ No newline at end of file diff --git a/src/auto_archiver/core/v2config.py b/src/auto_archiver/core/config.py similarity index 93% rename from src/auto_archiver/core/v2config.py rename to src/auto_archiver/core/config.py index 3db6618..73bd585 100644 --- a/src/auto_archiver/core/v2config.py +++ b/src/auto_archiver/core/config.py @@ -5,31 +5,31 @@ from dataclasses import dataclass, field from typing import List from collections import defaultdict -from ..archivers import Archiverv2 +from ..archivers import Archiver from ..feeders import Feeder from ..databases import Database from ..formatters import Formatter -from ..storages import StorageV2 +from ..storages import Storage from . import Step from ..enrichers import Enricher @dataclass -class ConfigV2: +class Config: # TODO: should Config inherit from Step so it can have it's own configurations? # these are only detected if they are put to the respective __init__.py configurable_parents = [ Feeder, Enricher, - Archiverv2, + Archiver, Database, - StorageV2, + Storage, Formatter # Util ] feeder: Step # TODO:= BaseFeeder formatter: Formatter - archivers: List[Archiverv2] = field(default_factory=[]) # TODO: fix type + archivers: List[Archiver] = field(default_factory=[]) # TODO: fix type enrichers: List[Enricher] = field(default_factory=[]) storages: List[Step] = field(default_factory=[]) # TODO: fix type databases: List[Database] = field(default_factory=[]) @@ -107,9 +107,9 @@ class ConfigV2: self.feeder = Feeder.init(steps.get("feeder", "cli_feeder"), self.config) self.formatter = Formatter.init(steps.get("formatter", "html_formatter"), self.config) self.enrichers = [Enricher.init(e, self.config) for e in steps.get("enrichers", [])] - self.archivers = [Archiverv2.init(e, self.config) for e in (steps.get("archivers") or [])] + self.archivers = [Archiver.init(e, self.config) for e in (steps.get("archivers") or [])] self.databases = [Database.init(e, self.config) for e in steps.get("databases", [])] - self.storages = [StorageV2.init(e, self.config) for e in steps.get("storages", [])] + self.storages = [Storage.init(e, self.config) for e in steps.get("storages", [])] print("feeder", self.feeder) print("enrichers", [e for e in self.enrichers]) diff --git a/src/auto_archiver/core/metadata.py b/src/auto_archiver/core/metadata.py index bb3079c..f09d7e0 100644 --- a/src/auto_archiver/core/metadata.py +++ b/src/auto_archiver/core/metadata.py @@ -4,7 +4,7 @@ from ast import List, Set from typing import Any, Union, Dict from dataclasses import dataclass, field from dataclasses_json import dataclass_json -import datetime, mimetypes +import datetime from urllib.parse import urlparse from loguru import logger from dateutil.parser import parse as parse_dt @@ -17,7 +17,7 @@ class Metadata: status: str = "no archiver" _processed_at: datetime = datetime.datetime.utcnow().replace(tzinfo=datetime.timezone.utc) metadata: Dict[str, Any] = field(default_factory=dict) - tmp_keys: Set[str] = field(default_factory=set, repr=False) # keys that are not to be saved in DBs + tmp_keys: Set[str] = field(default_factory=set, repr=False, metadata={"exclude":True}) # keys that are not to be saved in DBs media: List[Media] = field(default_factory=list) final_media: Media = None # can be overwritten by formatters rearchivable: bool = False diff --git a/src/auto_archiver/core/orchestrator.py b/src/auto_archiver/core/orchestrator.py index 19a35aa..bb7902e 100644 --- a/src/auto_archiver/core/orchestrator.py +++ b/src/auto_archiver/core/orchestrator.py @@ -3,10 +3,10 @@ from ast import List from typing import Union, Dict from dataclasses import dataclass -from ..archivers import Archiverv2 +from ..archivers import Archiver from ..feeders import Feeder from ..formatters import Formatter -from ..storages import StorageV2 +from ..storages import Storage from ..enrichers import Enricher from ..databases import Database from .media import Media @@ -59,9 +59,9 @@ class ArchivingOrchestrator: self.feeder: Feeder = config.feeder self.formatter: Formatter = config.formatter self.enrichers = config.enrichers - self.archivers: List[Archiverv2] = config.archivers + self.archivers: List[Archiver] = config.archivers self.databases: List[Database] = config.databases - self.storages: List[StorageV2] = config.storages + self.storages: List[Storage] = config.storages for a in self.archivers: a.setup() @@ -69,12 +69,12 @@ class ArchivingOrchestrator: for item in self.feeder: self.feed_item(item) - def feed_item(self, item:Metadata) -> Metadata: + def feed_item(self, item: Metadata) -> Metadata: print("ARCHIVING", item) try: with tempfile.TemporaryDirectory(dir="./") as tmp_dir: item.set_tmp_dir(tmp_dir) - result = self.archive(item) + return self.archive(item) except KeyboardInterrupt: # catches keyboard interruptions to do a clean exit logger.warning(f"caught interrupt on {item=}") @@ -84,8 +84,6 @@ class ArchivingOrchestrator: logger.error(f'Got unexpected error on item {item}: {e}\n{traceback.format_exc()}') for d in self.databases: d.failed(item) - return result - # how does this handle the parameters like folder which can be different for each archiver? # the storage needs to know where to archive!! # solution: feeders have context: extra metadata that they can read or ignore, @@ -154,7 +152,7 @@ class ArchivingOrchestrator: for prop in m.properties.values(): if isinstance(prop, Media): s.store(prop, result) - if isinstance(prop, list) and len(prop)>0 and isinstance(prop[0], Media): + if isinstance(prop, list) and len(prop) > 0 and isinstance(prop[0], Media): for prop_media in prop: s.store(prop_media, result) diff --git a/src/auto_archiver/core/step.py b/src/auto_archiver/core/step.py index ae10869..4c04c94 100644 --- a/src/auto_archiver/core/step.py +++ b/src/auto_archiver/core/step.py @@ -28,7 +28,7 @@ class Step(ABC): for sub in child.__subclasses__(): if sub.name == name: return sub(config) - raise ClassFoundException(f"Unable to initialize STEP with {name=}, check your configuration file/step names.") + raise ClassFoundException(f"Unable to initialize STEP with {name=}, check your configuration file/step names, and make sure you made the step discoverable by putting it into __init__.py") def assert_valid_string(self, prop: str) -> None: """ diff --git a/src/auto_archiver/databases/__init__.py b/src/auto_archiver/databases/__init__.py index 3a3e907..1e676ea 100644 --- a/src/auto_archiver/databases/__init__.py +++ b/src/auto_archiver/databases/__init__.py @@ -1,3 +1,4 @@ from .database import Database from .gsheet_db import GsheetsDb -from .console_db import ConsoleDb \ No newline at end of file +from .console_db import ConsoleDb +from .csv_db import CSVDb \ No newline at end of file diff --git a/src/auto_archiver/databases/csv_db.py b/src/auto_archiver/databases/csv_db.py new file mode 100644 index 0000000..72c804b --- /dev/null +++ b/src/auto_archiver/databases/csv_db.py @@ -0,0 +1,33 @@ +import os +from loguru import logger + +from . import Database +from ..core import Metadata +from csv import DictWriter +from dataclasses import asdict + +class CSVDb(Database): + """ + Outputs results to a CSV file + """ + name = "csv_db" + + def __init__(self, config: dict) -> None: + # without this STEP.__init__ is not called + super().__init__(config) + self.assert_valid_string("csv_file") + + @staticmethod + def configs() -> dict: + return { + "csv_file": {"default": "db.csv", "help": "CSV file name"} + } + + def done(self, item: Metadata) -> None: + """archival result ready - should be saved to DB""" + logger.success(f"DONE {item}") + is_empty = not os.path.isfile(self.csv_file) or os.path.getsize(self.csv_file) == 0 + with open(self.csv_file, "a", encoding="utf-8") as outf: + writer = DictWriter(outf, fieldnames=asdict(Metadata())) + if is_empty: writer.writeheader() + writer.writerow(asdict(item)) diff --git a/src/auto_archiver/databases/database.py b/src/auto_archiver/databases/database.py index 3f09c4a..133afaa 100644 --- a/src/auto_archiver/databases/database.py +++ b/src/auto_archiver/databases/database.py @@ -18,7 +18,6 @@ class Database(Step, ABC): # only for typing... return Step.init(name, config, Database) - @abstractmethod def started(self, item: Metadata) -> None: """signals the DB that the given item archival has started""" pass diff --git a/src/auto_archiver/enrichers/wayback_enricher.py b/src/auto_archiver/enrichers/wayback_enricher.py index 945b148..187c3fe 100644 --- a/src/auto_archiver/enrichers/wayback_enricher.py +++ b/src/auto_archiver/enrichers/wayback_enricher.py @@ -2,10 +2,10 @@ from loguru import logger import time, requests from . import Enricher -from ..archivers import Archiverv2 +from ..archivers import Archiver from ..core import Metadata -class WaybackArchiverEnricher(Enricher, Archiverv2): +class WaybackArchiverEnricher(Enricher, Archiver): """ Submits the current URL to the webarchive and returns a job_id or completed archive """ diff --git a/src/auto_archiver/feeders/__init__.py b/src/auto_archiver/feeders/__init__.py index b11cd50..f42f98f 100644 --- a/src/auto_archiver/feeders/__init__.py +++ b/src/auto_archiver/feeders/__init__.py @@ -1,2 +1,3 @@ from.feeder import Feeder -from .gsheet_feeder import GsheetsFeeder \ No newline at end of file +from .gsheet_feeder import GsheetsFeeder +from .cli_feeder import CLIFeeder \ No newline at end of file diff --git a/src/auto_archiver/feeders/cli_feeder.py b/src/auto_archiver/feeders/cli_feeder.py new file mode 100644 index 0000000..6d52085 --- /dev/null +++ b/src/auto_archiver/feeders/cli_feeder.py @@ -0,0 +1,33 @@ +import gspread, os + +# from metadata import Metadata +from loguru import logger + +# from . import Enricher +from . import Feeder +from ..core import Metadata + + +class CLIFeeder(Feeder): + name = "cli_feeder" + + def __init__(self, config: dict) -> None: + # without this STEP.__init__ is not called + super().__init__(config) + assert type(self.urls) == list and len(self.urls) > 0, "Please provide a CSV list of URL(s) to process, with --cli_feeder.urls='url1,url2,url3'" + + @staticmethod + def configs() -> dict: + return { + "urls": { + "default": None, + "help": "URL(s) to archive, either a single URL or a list of urls, should not come from config.yaml", + "cli_set": lambda cli_val, cur_val: list(set(cli_val.split(","))) + }, + } + + def __iter__(self) -> Metadata: + for url in self.urls: + logger.debug(f"Processing {url}") + yield Metadata().set_url(url).set("folder", "cli", True) + logger.success(f"Processed {len(self.urls)} URL(s)") diff --git a/src/auto_archiver/storages/__init__.py b/src/auto_archiver/storages/__init__.py index 7094d25..250004a 100644 --- a/src/auto_archiver/storages/__init__.py +++ b/src/auto_archiver/storages/__init__.py @@ -1,9 +1,3 @@ -# we need to explicitly expose the available imports here -from .base_storage import Storage -# from .local_storage import LocalStorage, LocalConfig -# from .s3_storage import S3Config, S3Storage -# from .gd_storage import GDConfig, GDStorage - -from .storage import StorageV2 -from .s3 import S3StorageV2 -from .local import LocalStorageV2 \ No newline at end of file +from .storage import Storage +from .s3 import S3Storage +from .local import LocalStorage \ No newline at end of file diff --git a/src/auto_archiver/storages/base_storage.py b/src/auto_archiver/storages/base_storage.py deleted file mode 100644 index f147678..0000000 --- a/src/auto_archiver/storages/base_storage.py +++ /dev/null @@ -1,33 +0,0 @@ -import os, uuid -from loguru import logger -from abc import ABC, abstractmethod -from pathlib import Path - - -class Storage(ABC): - TMP_FOLDER = "tmp/" - - @abstractmethod - def __init__(self, config): pass - - @abstractmethod - def get_cdn_url(self, key): pass - - @abstractmethod - def exists(self, key): pass - - @abstractmethod - def uploadf(self, file, key, **kwargs): pass - - def clean_key(self, key): - # Some storages does not work well with trailing forward slashes and some keys come with that - if key.startswith('/'): - logger.debug(f'Found and fixed a leading "/" for {key=}') - return key[1:] - return key - - - def upload(self, filename: str, key: str, **kwargs): - logger.debug(f'[{self.__class__.__name__}] uploading file {filename} with key {key}') - with open(filename, 'rb') as f: - self.uploadf(f, key, **kwargs) diff --git a/src/auto_archiver/storages/gd_storage.py b/src/auto_archiver/storages/gd_storage.py index 3af77f1..09c8938 100644 --- a/src/auto_archiver/storages/gd_storage.py +++ b/src/auto_archiver/storages/gd_storage.py @@ -1,178 +1,181 @@ -import os, time -from loguru import logger -from .base_storage import Storage -from dataclasses import dataclass -from googleapiclient.discovery import build -from googleapiclient.http import MediaFileUpload -from google.oauth2 import service_account +#TODO: refactor GDriveStorage before merging to main + +# import os, time + +# from loguru import logger +# from .base_storage import Storage +# from dataclasses import dataclass +# from googleapiclient.discovery import build +# from googleapiclient.http import MediaFileUpload +# from google.oauth2 import service_account -from google.oauth2.credentials import Credentials -from google.auth.transport.requests import Request +# from google.oauth2.credentials import Credentials +# from google.auth.transport.requests import Request -@dataclass -class GDConfig: - root_folder_id: str - oauth_token_filename: str - service_account: str = "service_account.json" - folder: str = "default" +# @dataclass +# class GDConfig: +# root_folder_id: str +# oauth_token_filename: str +# service_account: str = "service_account.json" +# folder: str = "default" -class GDStorage(Storage): - def __init__(self, config: GDConfig): - self.folder = config.folder - self.root_folder_id = config.root_folder_id +# class GDStorage(Storage): +# def __init__(self, config: GDConfig): +# self.folder = config.folder +# self.root_folder_id = config.root_folder_id - SCOPES=['https://www.googleapis.com/auth/drive'] +# SCOPES=['https://www.googleapis.com/auth/drive'] - token_file = config.oauth_token_filename - if token_file is not None: - """ - Tokens are refreshed after 1 hour - however keep working for 7 days (tbc) - so as long as the job doesn't last for 7 days - then this method of refreshing only once per run will work - see this link for details on the token - https://davemateer.com/2022/04/28/google-drive-with-python#tokens - """ - logger.debug(f'Using GD OAuth token {token_file}') - creds = Credentials.from_authorized_user_file(token_file, SCOPES) +# token_file = config.oauth_token_filename +# if token_file is not None: +# """ +# Tokens are refreshed after 1 hour +# however keep working for 7 days (tbc) +# so as long as the job doesn't last for 7 days +# then this method of refreshing only once per run will work +# see this link for details on the token +# https://davemateer.com/2022/04/28/google-drive-with-python#tokens +# """ +# logger.debug(f'Using GD OAuth token {token_file}') +# creds = Credentials.from_authorized_user_file(token_file, SCOPES) - if not creds or not creds.valid: - if creds and creds.expired and creds.refresh_token: - logger.debug('Requesting new GD OAuth token') - creds.refresh(Request()) - else: - raise Exception("Problem with creds - create the token again") +# if not creds or not creds.valid: +# if creds and creds.expired and creds.refresh_token: +# logger.debug('Requesting new GD OAuth token') +# creds.refresh(Request()) +# else: +# raise Exception("Problem with creds - create the token again") - # Save the credentials for the next run - with open(token_file, 'w') as token: - logger.debug('Saving new GD OAuth token') - token.write(creds.to_json()) - else: - logger.debug('GD OAuth Token valid') - else: - gd_service_account = config.service_account - logger.debug(f'Using GD Service Account {gd_service_account}') - creds = service_account.Credentials.from_service_account_file(gd_service_account, scopes=SCOPES) +# # Save the credentials for the next run +# with open(token_file, 'w') as token: +# logger.debug('Saving new GD OAuth token') +# token.write(creds.to_json()) +# else: +# logger.debug('GD OAuth Token valid') +# else: +# gd_service_account = config.service_account +# logger.debug(f'Using GD Service Account {gd_service_account}') +# creds = service_account.Credentials.from_service_account_file(gd_service_account, scopes=SCOPES) - self.service = build('drive', 'v3', credentials=creds) +# self.service = build('drive', 'v3', credentials=creds) - def get_cdn_url(self, key): - """ - only support files saved in a folder for GD - S3 supports folder and all stored in the root - """ - key = self.clean_key(key) +# def get_cdn_url(self, key): +# """ +# only support files saved in a folder for GD +# S3 supports folder and all stored in the root +# """ +# key = self.clean_key(key) - full_name = os.path.join(self.folder, key) - parent_id, folder_id = self.root_folder_id, None - path_parts = full_name.split(os.path.sep) - filename = path_parts[-1] - logger.info(f"looking for folders for {path_parts[0:-1]} before uploading {filename=}") - for folder in path_parts[0:-1]: - folder_id = self._get_id_from_parent_and_name(parent_id, folder, use_mime_type=True, raise_on_missing=True) - parent_id = folder_id +# full_name = os.path.join(self.folder, key) +# parent_id, folder_id = self.root_folder_id, None +# path_parts = full_name.split(os.path.sep) +# filename = path_parts[-1] +# logger.info(f"looking for folders for {path_parts[0:-1]} before uploading {filename=}") +# for folder in path_parts[0:-1]: +# folder_id = self._get_id_from_parent_and_name(parent_id, folder, use_mime_type=True, raise_on_missing=True) +# parent_id = folder_id - # get id of file inside folder (or sub folder) - file_id = self._get_id_from_parent_and_name(folder_id, filename) - return f"https://drive.google.com/file/d/{file_id}/view?usp=sharing" +# # get id of file inside folder (or sub folder) +# file_id = self._get_id_from_parent_and_name(folder_id, filename) +# return f"https://drive.google.com/file/d/{file_id}/view?usp=sharing" - def exists(self, key): - try: - self.get_cdn_url(key) - return True - except: return False +# def exists(self, key): +# try: +# self.get_cdn_url(key) +# return True +# except: return False - def uploadf(self, file: str, key: str, **_kwargs): - """ - 1. for each sub-folder in the path check if exists or create - 2. upload file to root_id/other_paths.../filename - """ - key = self.clean_key(key) +# def uploadf(self, file: str, key: str, **_kwargs): +# """ +# 1. for each sub-folder in the path check if exists or create +# 2. upload file to root_id/other_paths.../filename +# """ +# key = self.clean_key(key) - full_name = os.path.join(self.folder, key) - parent_id, upload_to = self.root_folder_id, None - path_parts = full_name.split(os.path.sep) - filename = path_parts[-1] - logger.info(f"checking folders {path_parts[0:-1]} exist (or creating) before uploading {filename=}") - for folder in path_parts[0:-1]: - upload_to = self._get_id_from_parent_and_name(parent_id, folder, use_mime_type=True, raise_on_missing=False) - if upload_to is None: - upload_to = self._mkdir(folder, parent_id) - parent_id = upload_to +# full_name = os.path.join(self.folder, key) +# parent_id, upload_to = self.root_folder_id, None +# path_parts = full_name.split(os.path.sep) +# filename = path_parts[-1] +# logger.info(f"checking folders {path_parts[0:-1]} exist (or creating) before uploading {filename=}") +# for folder in path_parts[0:-1]: +# upload_to = self._get_id_from_parent_and_name(parent_id, folder, use_mime_type=True, raise_on_missing=False) +# if upload_to is None: +# upload_to = self._mkdir(folder, parent_id) +# parent_id = upload_to - # upload file to gd - logger.debug(f'uploading {filename=} to folder id {upload_to}') - file_metadata = { - 'name': [filename], - 'parents': [upload_to] - } - media = MediaFileUpload(file, resumable=True) - gd_file = self.service.files().create(body=file_metadata, media_body=media, fields='id').execute() - logger.debug(f'uploadf: uploaded file {gd_file["id"]} succesfully in folder={upload_to}') +# # upload file to gd +# logger.debug(f'uploading {filename=} to folder id {upload_to}') +# file_metadata = { +# 'name': [filename], +# 'parents': [upload_to] +# } +# media = MediaFileUpload(file, resumable=True) +# gd_file = self.service.files().create(body=file_metadata, media_body=media, fields='id').execute() +# logger.debug(f'uploadf: uploaded file {gd_file["id"]} succesfully in folder={upload_to}') - def upload(self, filename: str, key: str, **kwargs): - # GD only requires the filename not a file reader - self.uploadf(filename, key, **kwargs) +# def upload(self, filename: str, key: str, **kwargs): +# # GD only requires the filename not a file reader +# self.uploadf(filename, key, **kwargs) - # gets the Drive folderID if it is there - def _get_id_from_parent_and_name(self, parent_id: str, name: str, retries: int = 1, sleep_seconds: int = 10, use_mime_type: bool = False, raise_on_missing: bool = True, use_cache=False): - """ - Retrieves the id of a folder or file from its @name and the @parent_id folder - Optionally does multiple @retries and sleeps @sleep_seconds between them - If @use_mime_type will restrict search to "mimeType='application/vnd.google-apps.folder'" - If @raise_on_missing will throw error when not found, or returns None - Will remember previous calls to avoid duplication if @use_cache - might not have all edge cases tested, so use at own risk - Returns the id of the file or folder from its name as a string - """ - # cache logic - if use_cache: - self.api_cache = getattr(self, "api_cache", {}) - cache_key = f"{parent_id}_{name}_{use_mime_type}" - if cache_key in self.api_cache: - logger.debug(f"cache hit for {cache_key=}") - return self.api_cache[cache_key] +# # gets the Drive folderID if it is there +# def _get_id_from_parent_and_name(self, parent_id: str, name: str, retries: int = 1, sleep_seconds: int = 10, use_mime_type: bool = False, raise_on_missing: bool = True, use_cache=False): +# """ +# Retrieves the id of a folder or file from its @name and the @parent_id folder +# Optionally does multiple @retries and sleeps @sleep_seconds between them +# If @use_mime_type will restrict search to "mimeType='application/vnd.google-apps.folder'" +# If @raise_on_missing will throw error when not found, or returns None +# Will remember previous calls to avoid duplication if @use_cache - might not have all edge cases tested, so use at own risk +# Returns the id of the file or folder from its name as a string +# """ +# # cache logic +# if use_cache: +# self.api_cache = getattr(self, "api_cache", {}) +# cache_key = f"{parent_id}_{name}_{use_mime_type}" +# if cache_key in self.api_cache: +# logger.debug(f"cache hit for {cache_key=}") +# return self.api_cache[cache_key] - # API logic - debug_header: str = f"[searching {name=} in {parent_id=}]" - query_string = f"'{parent_id}' in parents and name = '{name}' and trashed = false " - if use_mime_type: - query_string += f" and mimeType='application/vnd.google-apps.folder' " +# # API logic +# debug_header: str = f"[searching {name=} in {parent_id=}]" +# query_string = f"'{parent_id}' in parents and name = '{name}' and trashed = false " +# if use_mime_type: +# query_string += f" and mimeType='application/vnd.google-apps.folder' " - for attempt in range(retries): - results = self.service.files().list( - q=query_string, - spaces='drive', # ie not appDataFolder or photos - fields='files(id, name)' - ).execute() - items = results.get('files', []) +# for attempt in range(retries): +# results = self.service.files().list( +# q=query_string, +# spaces='drive', # ie not appDataFolder or photos +# fields='files(id, name)' +# ).execute() +# items = results.get('files', []) - if len(items) > 0: - logger.debug(f"{debug_header} found {len(items)} matches, returning last of {','.join([i['id'] for i in items])}") - _id = items[-1]['id'] - if use_cache: self.api_cache[cache_key] = _id - return _id - else: - logger.debug(f'{debug_header} not found, attempt {attempt+1}/{retries}.') - if attempt < retries - 1: - logger.debug(f'sleeping for {sleep_seconds} second(s)') - time.sleep(sleep_seconds) +# if len(items) > 0: +# logger.debug(f"{debug_header} found {len(items)} matches, returning last of {','.join([i['id'] for i in items])}") +# _id = items[-1]['id'] +# if use_cache: self.api_cache[cache_key] = _id +# return _id +# else: +# logger.debug(f'{debug_header} not found, attempt {attempt+1}/{retries}.') +# if attempt < retries - 1: +# logger.debug(f'sleeping for {sleep_seconds} second(s)') +# time.sleep(sleep_seconds) - if raise_on_missing: - raise ValueError(f'{debug_header} not found after {retries} attempt(s)') - return None +# if raise_on_missing: +# raise ValueError(f'{debug_header} not found after {retries} attempt(s)') +# return None - def _mkdir(self, name: str, parent_id: str): - """ - Creates a new GDrive folder @name inside folder @parent_id - Returns id of the created folder - """ - logger.debug(f'Creating new folder with {name=} inside {parent_id=}') - file_metadata = { - 'name': [name], - 'mimeType': 'application/vnd.google-apps.folder', - 'parents': [parent_id] - } - gd_folder = self.service.files().create(body=file_metadata, fields='id').execute() - return gd_folder.get('id') +# def _mkdir(self, name: str, parent_id: str): +# """ +# Creates a new GDrive folder @name inside folder @parent_id +# Returns id of the created folder +# """ +# logger.debug(f'Creating new folder with {name=} inside {parent_id=}') +# file_metadata = { +# 'name': [name], +# 'mimeType': 'application/vnd.google-apps.folder', +# 'parents': [parent_id] +# } +# gd_folder = self.service.files().create(body=file_metadata, fields='id').execute() +# return gd_folder.get('id') diff --git a/src/auto_archiver/storages/local.py b/src/auto_archiver/storages/local.py index 1d44e6f..f5768a9 100644 --- a/src/auto_archiver/storages/local.py +++ b/src/auto_archiver/storages/local.py @@ -8,10 +8,10 @@ from slugify import slugify from ..core import Metadata from ..core import Media -from ..storages import StorageV2 +from ..storages import Storage -class LocalStorageV2(StorageV2): +class LocalStorage(Storage): name = "local_storage" def __init__(self, config: dict) -> None: diff --git a/src/auto_archiver/storages/s3.py b/src/auto_archiver/storages/s3.py index 25eb324..a88e2b2 100644 --- a/src/auto_archiver/storages/s3.py +++ b/src/auto_archiver/storages/s3.py @@ -4,12 +4,12 @@ import boto3, uuid, os, mimetypes from botocore.errorfactory import ClientError from ..core import Metadata from ..core import Media -from ..storages import StorageV2 +from ..storages import Storage from loguru import logger from slugify import slugify -class S3StorageV2(StorageV2): +class S3Storage(Storage): name = "s3_storage" def __init__(self, config: dict) -> None: diff --git a/src/auto_archiver/storages/s3_storage.py b/src/auto_archiver/storages/s3_storage.py deleted file mode 100644 index 563d2ea..0000000 --- a/src/auto_archiver/storages/s3_storage.py +++ /dev/null @@ -1,80 +0,0 @@ -import uuid, os, mimetypes -from dataclasses import dataclass - -import boto3 -from botocore.errorfactory import ClientError - -from .base_storage import Storage -from dataclasses import dataclass -from loguru import logger - - -@dataclass -class S3Config: - bucket: str - region: str - key: str - secret: str - folder: str = "" - endpoint_url: str = "https://{region}.digitaloceanspaces.com" - cdn_url: str = "https://{bucket}.{region}.cdn.digitaloceanspaces.com/{key}" - private: bool = False - key_path: str = "default" # 'default' uses full naming, 'random' uses generated uuid - - -class S3Storage(Storage): - - def __init__(self, config: S3Config): - self.bucket = config.bucket - self.region = config.region - self.folder = config.folder - self.private = config.private - self.cdn_url = config.cdn_url - self.key_path = config.key_path - self.key_dict = {} - - self.s3 = boto3.client( - 's3', - region_name=config.region, - endpoint_url=config.endpoint_url.format(region=config.region), - aws_access_key_id=config.key, - aws_secret_access_key=config.secret - ) - - def _get_path(self, key): - """ - Depends on the self.key_path configuration: - * random - assigns a random UUID which can be used in conjunction with "private=false" to have unguessable documents publicly available -> self.folder/randomUUID - * default -> defaults to self.folder/key - """ - # defaults to /key - final_key = key - if self.key_path == "random": - if key not in self.key_dict: - ext = os.path.splitext(key)[1] - self.key_dict[key] = f"{str(uuid.uuid4())}{ext}" - final_key = self.key_dict[key] - return os.path.join(self.folder, final_key) - - def get_cdn_url(self, key): - return self.cdn_url.format(bucket=self.bucket, region=self.region, key=self._get_path(key)) - - def exists(self, key): - try: - self.s3.head_object(Bucket=self.bucket, Key=self._get_path(key)) - return True - except ClientError: - return False - - def uploadf(self, file, key, **kwargs): - extra_args = kwargs.get("extra_args", {}) - if not self.private and 'ACL' not in extra_args: - extra_args['ACL'] = 'public-read' - - if 'ContentType' not in extra_args: - try: - extra_args['ContentType'] = mimetypes.guess_type(key)[0] - except Exception as e: - logger.error(f"Unable to get mimetype for {key=}, error: {e}") - - self.s3.upload_fileobj(file, Bucket=self.bucket, Key=self._get_path(key), ExtraArgs=extra_args) diff --git a/src/auto_archiver/storages/storage.py b/src/auto_archiver/storages/storage.py index 9d6a005..3e4134a 100644 --- a/src/auto_archiver/storages/storage.py +++ b/src/auto_archiver/storages/storage.py @@ -10,7 +10,7 @@ from slugify import slugify @dataclass -class StorageV2(Step): +class Storage(Step): name = "storage" def __init__(self, config: dict) -> None: @@ -18,8 +18,8 @@ class StorageV2(Step): super().__init__(config) # only for typing... - def init(name: str, config: dict) -> StorageV2: - return Step.init(name, config, StorageV2) + def init(name: str, config: dict) -> Storage: + return Step.init(name, config, Storage) def store(self, media: Media, item: Metadata) -> None: self.set_key(media, item)