From b763fc418838b30433e05877d396c02c7d50a0fb Mon Sep 17 00:00:00 2001
From: msramalho <19508417+msramalho@users.noreply.github.com>
Date: Sat, 21 Jan 2023 19:44:12 +0000
Subject: [PATCH] final naming cleanup + new feeders/dbs
---
.gitignore | 3 +-
src/auto_archiver/__init__.py | 2 +-
src/auto_archiver/__main__.py | 4 +-
src/auto_archiver/archivers/__init__.py | 18 +-
src/auto_archiver/archivers/archiver.py | 6 +-
src/auto_archiver/archivers/base_archiver.py | 384 ------------------
...am_archiverv2.py => instagram_archiver.py} | 4 +-
...ram_archiverv2.py => telegram_archiver.py} | 4 +-
...hon_archiverv2.py => telethon_archiver.py} | 4 +-
...iktok_archiverv2.py => tiktok_archiver.py} | 4 +-
..._archiverv2.py => twitter_api_archiver.py} | 6 +-
...tter_archiverv2.py => twitter_archiver.py} | 4 +-
.../{vk_archiverv2.py => vk_archiver.py} | 4 +-
...dl_archiverv2.py => youtubedl_archiver.py} | 4 +-
src/auto_archiver/auto_archive.py | 176 --------
src/auto_archiver/auto_auto_archive.py | 55 +--
src/auto_archiver/core/__init__.py | 2 +-
.../core/{v2config.py => config.py} | 16 +-
src/auto_archiver/core/metadata.py | 4 +-
src/auto_archiver/core/orchestrator.py | 16 +-
src/auto_archiver/core/step.py | 2 +-
src/auto_archiver/databases/__init__.py | 3 +-
src/auto_archiver/databases/csv_db.py | 33 ++
src/auto_archiver/databases/database.py | 1 -
.../enrichers/wayback_enricher.py | 4 +-
src/auto_archiver/feeders/__init__.py | 3 +-
src/auto_archiver/feeders/cli_feeder.py | 33 ++
src/auto_archiver/storages/__init__.py | 12 +-
src/auto_archiver/storages/base_storage.py | 33 --
src/auto_archiver/storages/gd_storage.py | 311 +++++++-------
src/auto_archiver/storages/local.py | 4 +-
src/auto_archiver/storages/s3.py | 4 +-
src/auto_archiver/storages/s3_storage.py | 80 ----
src/auto_archiver/storages/storage.py | 6 +-
34 files changed, 322 insertions(+), 927 deletions(-)
delete mode 100644 src/auto_archiver/archivers/base_archiver.py
rename src/auto_archiver/archivers/{instagram_archiverv2.py => instagram_archiver.py} (99%)
rename src/auto_archiver/archivers/{telegram_archiverv2.py => telegram_archiver.py} (97%)
rename src/auto_archiver/archivers/{telethon_archiverv2.py => telethon_archiver.py} (99%)
rename src/auto_archiver/archivers/{tiktok_archiverv2.py => tiktok_archiver.py} (96%)
rename src/auto_archiver/archivers/{twitter_api_archiverv2.py => twitter_api_archiver.py} (97%)
rename src/auto_archiver/archivers/{twitter_archiverv2.py => twitter_archiver.py} (98%)
rename src/auto_archiver/archivers/{vk_archiverv2.py => vk_archiver.py} (97%)
rename src/auto_archiver/archivers/{youtubedl_archiverv2.py => youtubedl_archiver.py} (97%)
delete mode 100644 src/auto_archiver/auto_archive.py
rename src/auto_archiver/core/{v2config.py => config.py} (93%)
create mode 100644 src/auto_archiver/databases/csv_db.py
create mode 100644 src/auto_archiver/feeders/cli_feeder.py
delete mode 100644 src/auto_archiver/storages/base_storage.py
delete mode 100644 src/auto_archiver/storages/s3_storage.py
diff --git a/.gitignore b/.gitignore
index 2c55563..b3a4b36 100644
--- a/.gitignore
+++ b/.gitignore
@@ -26,4 +26,5 @@ instaloader/*
instaloader.session
orchestration.yaml
auto_archiver.egg-info*
-logs*
\ No newline at end of file
+logs*
+*.csv
\ No newline at end of file
diff --git a/src/auto_archiver/__init__.py b/src/auto_archiver/__init__.py
index c02d330..adb8551 100644
--- a/src/auto_archiver/__init__.py
+++ b/src/auto_archiver/__init__.py
@@ -2,6 +2,6 @@ from . import archivers, databases, enrichers, feeders, formatters, storages, ut
# need to manually specify due to cyclical deps
from .core.orchestrator import ArchivingOrchestrator
-from .core.v2config import ConfigV2
+from .core.config import Config
# making accessible directly
from .core.metadata import Metadata
\ No newline at end of file
diff --git a/src/auto_archiver/__main__.py b/src/auto_archiver/__main__.py
index 812e4e2..e8a81c4 100644
--- a/src/auto_archiver/__main__.py
+++ b/src/auto_archiver/__main__.py
@@ -1,8 +1,8 @@
-from . import ConfigV2
+from . import Config
from . import ArchivingOrchestrator
def main():
- config = ConfigV2()
+ config = Config()
config.parse()
orchestrator = ArchivingOrchestrator(config)
orchestrator.feed()
diff --git a/src/auto_archiver/archivers/__init__.py b/src/auto_archiver/archivers/__init__.py
index 595ea8c..b5c4faa 100644
--- a/src/auto_archiver/archivers/__init__.py
+++ b/src/auto_archiver/archivers/__init__.py
@@ -10,12 +10,12 @@
# from .twitter_api_archiver import TwitterApiArchiver
# from .instagram_archiver import InstagramArchiver
-from .archiver import Archiverv2
-from .telethon_archiverv2 import TelethonArchiver
-from .twitter_archiverv2 import TwitterArchiver
-from .twitter_api_archiverv2 import TwitterApiArchiver
-from .instagram_archiverv2 import InstagramArchiver
-from .tiktok_archiverv2 import TiktokArchiver
-from .telegram_archiverv2 import TelegramArchiver
-from .vk_archiverv2 import VkArchiver
-from .youtubedl_archiverv2 import YoutubeDLArchiver
\ No newline at end of file
+from .archiver import Archiver
+from .telethon_archiver import TelethonArchiver
+from .twitter_archiver import TwitterArchiver
+from .twitter_api_archiver import TwitterApiArchiver
+from .instagram_archiver import InstagramArchiver
+from .tiktok_archiver import TiktokArchiver
+from .telegram_archiver import TelegramArchiver
+from .vk_archiver import VkArchiver
+from .youtubedl_archiver import YoutubeDLArchiver
\ No newline at end of file
diff --git a/src/auto_archiver/archivers/archiver.py b/src/auto_archiver/archivers/archiver.py
index d09044e..49efd66 100644
--- a/src/auto_archiver/archivers/archiver.py
+++ b/src/auto_archiver/archivers/archiver.py
@@ -8,16 +8,16 @@ from ..core import Step
@dataclass
-class Archiverv2(Step):
+class Archiver(Step):
name = "archiver"
def __init__(self, config: dict) -> None:
# without this STEP.__init__ is not called
super().__init__(config)
- def init(name: str, config: dict) -> Archiverv2:
+ def init(name: str, config: dict) -> Archiver:
# only for typing...
- return Step.init(name, config, Archiverv2)
+ return Step.init(name, config, Archiver)
def setup(self) -> None:
# used when archivers need to login or do other one-time setup
diff --git a/src/auto_archiver/archivers/base_archiver.py b/src/auto_archiver/archivers/base_archiver.py
deleted file mode 100644
index 103b9ff..0000000
--- a/src/auto_archiver/archivers/base_archiver.py
+++ /dev/null
@@ -1,384 +0,0 @@
-import os, datetime, shutil, hashlib, time, requests, re, mimetypes, subprocess
-from dataclasses import dataclass, field
-from abc import ABC, abstractmethod
-from urllib.parse import urlparse
-from random import randrange
-from collections import defaultdict
-
-import ffmpeg
-from loguru import logger
-from selenium.common.exceptions import TimeoutException
-from selenium.webdriver.common.by import By
-from slugify import slugify
-
-from ..configs import Config
-from storages import Storage
-from utils import mkdir_if_not_exists
-
-
-@dataclass
-class ArchiveResult:
- status: str
- cdn_url: str = None
- thumbnail: str = None
- thumbnail_index: str = None
- duration: float = None
- title: str = None
- timestamp: datetime.datetime = None
- screenshot: str = None
- wacz: str = None
- hash: str = None
- media: list = field(default_factory=list)
-
-class Archiver(ABC):
- name = "default"
- retry_regex = r"retrying at (\d+)$"
-
- def __init__(self, storage: Storage, config: Config):
- self.storage = storage
- self.driver = config.webdriver
- self.hash_algorithm = config.hash_algorithm
- self.browsertrix = config.browsertrix_config
- self.is_docker = config.is_docker
- self.media = []
-
- def __str__(self):
- return self.__class__.__name__
-
- def __repr__(self):
- return self.__str__()
-
- @abstractmethod
- def download(self, url, check_if_exists=False): pass
-
- def generateArchiveResult(self, **kwargs):
- # remove duplicates
- if "cdn_url" in kwargs:
- self.add_to_media(kwargs["cdn_url"], None, kwargs.get("hash"))
- kwargs["media"] = [dict(t) for t in {tuple(d.items()) for d in self.media}]
- return ArchiveResult(**kwargs)
-
- def get_netloc(self, url):
- return urlparse(url).netloc
-
- def add_to_media(self, cdn_url: str, key: str = None, hash: str = None):
- media_info = {"url": cdn_url, "mime": self._guess_file_type(cdn_url) or "misc"}
- if key: media_info["key"] = key
- if hash: media_info["hash"] = hash
- self.media.append(media_info)
-
- def generate_media_page_html(self, url, urls_info: dict, object, thumbnail=None):
- """
- Generates an index.html page where each @urls_info is displayed
- """
- for ui in urls_info:
- self.add_to_media(ui["cdn_url"], ui["key"], ui["hash"])
- page = f'''
{url}
-
- Archived media from {self.name}
- '''
-
- for url_info in urls_info:
- mime_global = self._guess_file_type(url_info["key"])
- preview = ""
- if mime_global == "image":
- preview = f'
'
- elif mime_global == "video":
- preview = f''
- page += f'''- {preview}{url_info['key']}: {url_info['hash']}
'''
-
- page += f"
{self.name} object data:
{object}"
- page += f""
-
- page_key = self.get_html_key(url)
- page_filename = os.path.join(Storage.TMP_FOLDER, page_key)
-
- with open(page_filename, "w") as f:
- f.write(page)
-
- page_hash = self.get_hash(page_filename)
-
- self.storage.upload(page_filename, page_key, extra_args={
- 'ACL': 'public-read', 'ContentType': 'text/html'})
-
- page_cdn = self.storage.get_cdn_url(page_key)
- return (page_cdn, page_hash, thumbnail)
-
- def _guess_file_type(self, path: str):
- """
- Receives a URL or filename and returns global mimetype like 'image' or 'video'
- see https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/MIME_types/Common_types
- """
- mime = mimetypes.guess_type(path)[0]
- if mime is not None:
- return mime.split("/")[0]
- return ""
-
- def download_from_url(self, url, to_filename):
- headers = {
- 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'
- }
- d = requests.get(url, headers=headers)
- with open(to_filename, 'wb') as f:
- f.write(d.content)
-
- def generate_media_page(self, urls, url, object):
- """
- For a list of media urls, fetch them, upload them
- and call self.generate_media_page_html with them
- """
- for media_url in urls:
- self.add_to_media(media_url)
-
- thumbnail = None
- uploaded_media = []
- for media_url in urls:
- key = self._get_key_from_url(media_url, ".jpg")
-
- filename = os.path.join(Storage.TMP_FOLDER, key)
- self.download_from_url(media_url, filename)
- self.storage.upload(filename, key)
- hash = self.get_hash(filename)
- cdn_url = self.storage.get_cdn_url(key)
-
- if thumbnail is None:
- thumbnail = cdn_url
- uploaded_media.append({'cdn_url': cdn_url, 'key': key, 'hash': hash})
-
- return self.generate_media_page_html(url, uploaded_media, object, thumbnail=thumbnail)
-
- def get_key(self, filename):
- """
- returns a key in the format "[archiverName]_[filename]" includes extension
- """
- tail = os.path.split(filename)[1] # returns filename.ext from full path
- _id, extension = os.path.splitext(tail) # returns [filename, .ext]
- if 'unknown_video' in _id:
- _id = _id.replace('unknown_video', 'jpg')
-
- # long filenames can cause problems, so trim them if necessary
- if len(_id) > 128:
- _id = _id[-128:]
-
- return f'{self.name}_{_id}{extension}'
-
- def get_html_key(self, url):
- return self._get_key_from_url(url, ".html")
-
- def _get_key_from_url(self, url, with_extension: str = None, append_datetime: bool = False):
- """
- Receives a URL and returns a slugified version of the URL path
- if a string is passed in @with_extension the slug is appended with it if there is no "." in the slug
- if @append_date is true, the key adds a timestamp after the URL slug and before the extension
- """
- url_path = urlparse(url).path
- path, ext = os.path.splitext(url_path)
- slug = slugify(path)
- if append_datetime:
- slug += "-" + slugify(datetime.datetime.utcnow().isoformat())
- if len(ext):
- slug += ext
- if with_extension is not None:
- if "." not in slug:
- slug += with_extension
- return self.get_key(slug)
-
- def get_hash(self, filename):
- with open(filename, "rb") as f:
- bytes = f.read() # read entire file as bytes
- logger.debug(f'Hash algorithm is {self.hash_algorithm}')
-
- if self.hash_algorithm == "SHA-256": hash = hashlib.sha256(bytes)
- elif self.hash_algorithm == "SHA3-512": hash = hashlib.sha3_512(bytes)
- else: raise Exception(f"Unknown Hash Algorithm of {self.hash_algorithm}")
-
- return hash.hexdigest()
-
- def get_screenshot(self, url):
- logger.debug(f"getting screenshot for {url=}")
- key = self._get_key_from_url(url, ".png", append_datetime=True)
- filename = os.path.join(Storage.TMP_FOLDER, key)
-
- # Accept cookies popup dismiss for ytdlp video
- if 'facebook.com' in url:
- try:
- logger.debug(f'Trying fb click accept cookie popup for {url}')
- self.driver.get("http://www.facebook.com")
- foo = self.driver.find_element(By.XPATH, "//button[@data-cookiebanner='accept_only_essential_button']")
- foo.click()
- logger.debug(f'fb click worked')
- # linux server needs a sleep otherwise facebook cookie won't have worked and we'll get a popup on next page
- time.sleep(2)
- except:
- logger.warning(f'Failed on fb accept cookies for url {url}')
-
- try:
- self.driver.get(url)
- time.sleep(6)
- except TimeoutException:
- logger.info("TimeoutException loading page for screenshot")
-
- self.driver.save_screenshot(filename)
- self.storage.upload(filename, key, extra_args={'ACL': 'public-read', 'ContentType': 'image/png'})
-
- cdn_url = self.storage.get_cdn_url(key)
- self.add_to_media(cdn_url, key)
-
- return cdn_url
-
- def get_wacz(self, url):
- if not self.browsertrix.enabled:
- logger.debug(f"Browsertrix WACZ generation is not enabled, skipping.")
- return
- if self.is_docker:
- # TODO: figure out support for browsertrix in docker
- # see: https://github.com/bellingcat/auto-archiver/issues/66
- logger.warning(f"Browsertrix WACZ is not yet supported when using DOCKER.")
- return
-
- logger.debug(f"getting wacz for {url}")
- key = self._get_key_from_url(url, ".wacz", append_datetime=True)
- collection = re.sub('[^0-9a-zA-Z]+', '', key.replace(".wacz", ""))
-
- browsertrix_home = os.path.join(os.getcwd(), "browsertrix-tmp")
- cmd = [
- "docker", "run",
- "--rm", # delete container once it has completed running
- "-v", f"{browsertrix_home}:/crawls/",
- # "-it", # this leads to "the input device is not a TTY"
- "webrecorder/browsertrix-crawler", "crawl",
- "--url", url,
- "--scopeType", "page",
- "--generateWACZ",
- "--text",
- "--collection", collection,
- "--behaviors", "autoscroll,autoplay,autofetch,siteSpecific",
- "--behaviorTimeout", str(self.browsertrix.timeout_seconds),
- "--timeout", str(self.browsertrix.timeout_seconds)
- ]
-
- if not os.path.isdir(browsertrix_home):
- os.mkdir(browsertrix_home)
-
- if self.browsertrix.profile:
- shutil.copyfile(self.browsertrix.profile, os.path.join(browsertrix_home, "profile.tar.gz"))
- cmd.extend(["--profile", "/crawls/profile.tar.gz"])
-
- try:
- logger.info(f"Running browsertrix-crawler: {' '.join(cmd)}")
- subprocess.run(cmd, check=True)
- except Exception as e:
- logger.error(f"WACZ generation failed: {e}")
- return
-
- filename = os.path.join(browsertrix_home, "collections", collection, f"{collection}.wacz")
-
- # do not crash if upload fails
- try:
- self.storage.upload(filename, key, extra_args={
- 'ACL': 'public-read', 'ContentType': 'application/zip'})
- except FileNotFoundError as e:
- logger.warning(f"Unable to locate and upload WACZ {filename=}, {key=}")
-
- # clean up the local browsertrix files
- try:
- shutil.rmtree(browsertrix_home)
- except PermissionError:
- logger.warn(f"Unable to clean up browsertrix-crawler files in {browsertrix_home}")
-
- cdn_url = self.storage.get_cdn_url(key)
- self.add_to_media(cdn_url, key)
- return cdn_url
-
- def get_thumbnails(self, filename, key, duration=None):
- thumbnails_folder = os.path.splitext(filename)[0] + os.path.sep
- key_folder = key.split('.')[0] + os.path.sep
-
- mkdir_if_not_exists(thumbnails_folder)
-
- fps = 0.5
- if duration is not None:
- duration = float(duration)
-
- if duration < 60:
- fps = 10.0 / duration
- elif duration < 120:
- fps = 20.0 / duration
- else:
- fps = 40.0 / duration
-
- stream = ffmpeg.input(filename)
- stream = ffmpeg.filter(stream, 'fps', fps=fps).filter('scale', 512, -1)
- stream.output(thumbnails_folder + 'out%d.jpg').run()
-
- thumbnails = os.listdir(thumbnails_folder)
- cdn_urls = []
- for fname in thumbnails:
- if fname[-3:] == 'jpg':
- thumbnail_filename = thumbnails_folder + fname
- key = os.path.join(key_folder, fname)
-
- self.storage.upload(thumbnail_filename, key)
- cdn_url = self.storage.get_cdn_url(key)
- cdn_urls.append(cdn_url)
-
- if len(cdn_urls) == 0:
- return ('', '')
-
- key_thumb = cdn_urls[int(len(cdn_urls) * 0.1)]
-
- index_page = f'''{filename}
- '''
-
- for t in cdn_urls:
- index_page += f'
'
-
- index_page += f""
- index_fname = thumbnails_folder + 'index.html'
-
- with open(index_fname, 'w') as f:
- f.write(index_page)
-
- thumb_index = key_folder + 'index.html'
-
- self.storage.upload(index_fname, thumb_index, extra_args={
- 'ACL': 'public-read', 'ContentType': 'text/html'})
- shutil.rmtree(thumbnails_folder)
-
- thumb_index_cdn_url = self.storage.get_cdn_url(thumb_index)
-
- return (key_thumb, thumb_index_cdn_url)
-
- def signal_retry_in(self, min_seconds=1800, max_seconds=7200, **kwargs):
- """
- sets state to retry in random between (min_seconds, max_seconds)
- """
- now = datetime.datetime.now().timestamp()
- retry_at = int(now + randrange(min_seconds, max_seconds))
- logger.debug(f"signaling {retry_at=}")
- return ArchiveResult(status=f'retrying at {retry_at}', **kwargs)
-
- def is_retry(status):
- return re.search(Archiver.retry_regex, status) is not None
-
- def should_retry_from_status(status):
- """
- checks status against message in signal_retry_in
- returns true if enough time has elapsed, false otherwise
- """
- match = re.search(Archiver.retry_regex, status)
- if match:
- retry_at = int(match.group(1))
- now = datetime.datetime.now().timestamp()
- should_retry = now >= retry_at
- logger.debug(f"{should_retry=} since {now=} and {retry_at=}")
- return should_retry
- return False
-
- def remove_retry(status):
- """
- transforms the status from retry into something else
- """
- new_status = re.sub(Archiver.retry_regex, "failed: too many retries", status, 0)
- logger.debug(f"removing retry message at {status=}, got {new_status=}")
- return new_status
diff --git a/src/auto_archiver/archivers/instagram_archiverv2.py b/src/auto_archiver/archivers/instagram_archiver.py
similarity index 99%
rename from src/auto_archiver/archivers/instagram_archiverv2.py
rename to src/auto_archiver/archivers/instagram_archiver.py
index 126622a..c0d1b2c 100644
--- a/src/auto_archiver/archivers/instagram_archiverv2.py
+++ b/src/auto_archiver/archivers/instagram_archiver.py
@@ -2,11 +2,11 @@ import re, os, shutil, html, traceback
import instaloader # https://instaloader.github.io/as-module.html
from loguru import logger
-from . import Archiverv2
+from . import Archiver
from ..core import Metadata
from ..core import Media
-class InstagramArchiver(Archiverv2):
+class InstagramArchiver(Archiver):
"""
Uses Instaloader to download either a post (inc images, videos, text) or as much as possible from a profile (posts, stories, highlights, ...)
"""
diff --git a/src/auto_archiver/archivers/telegram_archiverv2.py b/src/auto_archiver/archivers/telegram_archiver.py
similarity index 97%
rename from src/auto_archiver/archivers/telegram_archiverv2.py
rename to src/auto_archiver/archivers/telegram_archiver.py
index 2f4bf23..b579dc4 100644
--- a/src/auto_archiver/archivers/telegram_archiverv2.py
+++ b/src/auto_archiver/archivers/telegram_archiver.py
@@ -4,12 +4,12 @@ import html
from bs4 import BeautifulSoup
from loguru import logger
-from . import Archiverv2
+from . import Archiver
from ..core import Metadata
from ..core import Media
-class TelegramArchiver(Archiverv2):
+class TelegramArchiver(Archiver):
"""
Archiver for telegram that does not require login, but the telethon_archiver is much more advised, will only return if at least one image or one video is found
"""
diff --git a/src/auto_archiver/archivers/telethon_archiverv2.py b/src/auto_archiver/archivers/telethon_archiver.py
similarity index 99%
rename from src/auto_archiver/archivers/telethon_archiverv2.py
rename to src/auto_archiver/archivers/telethon_archiver.py
index 57d27e1..20bf28e 100644
--- a/src/auto_archiver/archivers/telethon_archiverv2.py
+++ b/src/auto_archiver/archivers/telethon_archiver.py
@@ -8,12 +8,12 @@ from loguru import logger
from tqdm import tqdm
import re, time, json, os
-from . import Archiverv2
+from . import Archiver
from ..core import Metadata
from ..core import Media
-class TelethonArchiver(Archiverv2):
+class TelethonArchiver(Archiver):
name = "telethon_archiver"
link_pattern = re.compile(r"https:\/\/t\.me(\/c){0,1}\/(.+)\/(\d+)")
invite_pattern = re.compile(r"t.me(\/joinchat){0,1}\/\+?(.+)")
diff --git a/src/auto_archiver/archivers/tiktok_archiverv2.py b/src/auto_archiver/archivers/tiktok_archiver.py
similarity index 96%
rename from src/auto_archiver/archivers/tiktok_archiverv2.py
rename to src/auto_archiver/archivers/tiktok_archiver.py
index ea7f670..0c41193 100644
--- a/src/auto_archiver/archivers/tiktok_archiverv2.py
+++ b/src/auto_archiver/archivers/tiktok_archiver.py
@@ -5,12 +5,12 @@ import uuid
import tiktok_downloader
from loguru import logger
-from . import Archiverv2
+from . import Archiver
from ..core import Metadata
from ..core import Media
-class TiktokArchiver(Archiverv2):
+class TiktokArchiver(Archiver):
name = "tiktok_archiver"
def __init__(self, config: dict) -> None:
diff --git a/src/auto_archiver/archivers/twitter_api_archiverv2.py b/src/auto_archiver/archivers/twitter_api_archiver.py
similarity index 97%
rename from src/auto_archiver/archivers/twitter_api_archiverv2.py
rename to src/auto_archiver/archivers/twitter_api_archiver.py
index 007193c..821d8d4 100644
--- a/src/auto_archiver/archivers/twitter_api_archiverv2.py
+++ b/src/auto_archiver/archivers/twitter_api_archiver.py
@@ -7,13 +7,13 @@ from loguru import logger
from pytwitter import Api
from slugify import slugify
-from . import Archiverv2
-from .twitter_archiverv2 import TwitterArchiver
+from . import Archiver
+from .twitter_archiver import TwitterArchiver
from ..core import Metadata
from ..core import Media
-class TwitterApiArchiver(TwitterArchiver, Archiverv2):
+class TwitterApiArchiver(TwitterArchiver, Archiver):
name = "twitter_api_archiver"
def __init__(self, config: dict) -> None:
diff --git a/src/auto_archiver/archivers/twitter_archiverv2.py b/src/auto_archiver/archivers/twitter_archiver.py
similarity index 98%
rename from src/auto_archiver/archivers/twitter_archiverv2.py
rename to src/auto_archiver/archivers/twitter_archiver.py
index 58942ab..194025d 100644
--- a/src/auto_archiver/archivers/twitter_archiverv2.py
+++ b/src/auto_archiver/archivers/twitter_archiver.py
@@ -7,11 +7,11 @@ from loguru import logger
from snscrape.modules.twitter import TwitterTweetScraper, Video, Gif, Photo
from slugify import slugify
-from . import Archiverv2
+from . import Archiver
from ..core import Metadata
from ..core import Media
-class TwitterArchiver(Archiverv2):
+class TwitterArchiver(Archiver):
"""
This Twitter Archiver uses unofficial scraping methods.
"""
diff --git a/src/auto_archiver/archivers/vk_archiverv2.py b/src/auto_archiver/archivers/vk_archiver.py
similarity index 97%
rename from src/auto_archiver/archivers/vk_archiverv2.py
rename to src/auto_archiver/archivers/vk_archiver.py
index 4c2ad98..1d76282 100644
--- a/src/auto_archiver/archivers/vk_archiverv2.py
+++ b/src/auto_archiver/archivers/vk_archiver.py
@@ -2,12 +2,12 @@ from loguru import logger
from vk_url_scraper import VkScraper
from ..utils.misc import dump_payload
-from . import Archiverv2
+from . import Archiver
from ..core import Metadata
from ..core import Media
-class VkArchiver(Archiverv2):
+class VkArchiver(Archiver):
""""
VK videos are handled by YTDownloader, this archiver gets posts text and images.
Currently only works for /wall posts
diff --git a/src/auto_archiver/archivers/youtubedl_archiverv2.py b/src/auto_archiver/archivers/youtubedl_archiver.py
similarity index 97%
rename from src/auto_archiver/archivers/youtubedl_archiverv2.py
rename to src/auto_archiver/archivers/youtubedl_archiver.py
index 0b5916c..443fb17 100644
--- a/src/auto_archiver/archivers/youtubedl_archiverv2.py
+++ b/src/auto_archiver/archivers/youtubedl_archiver.py
@@ -4,12 +4,12 @@ import os
import yt_dlp
from loguru import logger
-from . import Archiverv2
+from . import Archiver
from ..core import Metadata
from ..core import Media
-class YoutubeDLArchiver(Archiverv2):
+class YoutubeDLArchiver(Archiver):
name = "youtubedl_enricher"
def __init__(self, config: dict) -> None:
diff --git a/src/auto_archiver/auto_archive.py b/src/auto_archiver/auto_archive.py
deleted file mode 100644
index a797405..0000000
--- a/src/auto_archiver/auto_archive.py
+++ /dev/null
@@ -1,176 +0,0 @@
-import os, datetime, traceback, random, tempfile
-
-from loguru import logger
-from slugify import slugify
-from urllib.parse import quote
-
-from archivers import TelethonArchiver, TelegramArchiver, TiktokArchiver, YoutubeDLArchiver, TwitterArchiver, TwitterApiArchiver, VkArchiver, WaybackArchiver, InstagramArchiver, ArchiveResult, Archiver
-from utils import GWorksheet, expand_url
-from configs import Config
-from storages import Storage
-
-random.seed()
-
-
-def update_sheet(gw, row, url, result: ArchiveResult):
- cell_updates = []
- row_values = gw.get_row(row)
-
- def batch_if_valid(col, val, final_value=None):
- final_value = final_value or val
- if val and gw.col_exists(col) and gw.get_cell(row_values, col) == '':
- cell_updates.append((row, col, final_value))
-
- cell_updates.append((row, 'status', result.status))
-
- batch_if_valid('archive', result.cdn_url)
- batch_if_valid('date', True, datetime.datetime.utcnow().replace(tzinfo=datetime.timezone.utc).isoformat())
- batch_if_valid('thumbnail', result.thumbnail, f'=IMAGE("{result.thumbnail}")')
- batch_if_valid('thumbnail_index', result.thumbnail_index)
- batch_if_valid('title', result.title)
- batch_if_valid('duration', result.duration, str(result.duration))
- batch_if_valid('screenshot', result.screenshot)
- batch_if_valid('hash', result.hash)
- if result.wacz is not None:
- batch_if_valid('wacz', result.wacz)
- batch_if_valid('replaywebpage', f'https://replayweb.page/?source={quote(result.wacz)}#view=pages&url={quote(url)}')
-
- if result.timestamp is not None:
- if type(result.timestamp) == int:
- timestamp_string = datetime.datetime.fromtimestamp(result.timestamp).replace(tzinfo=datetime.timezone.utc).isoformat()
- elif type(result.timestamp) == str:
- timestamp_string = result.timestamp
- else:
- timestamp_string = result.timestamp.isoformat()
-
- batch_if_valid('timestamp', timestamp_string)
-
- gw.batch_set_cell(cell_updates)
-
-
-def missing_required_columns(gw: GWorksheet):
- missing = False
- for required_col in ['url', 'status']:
- if not gw.col_exists(required_col):
- logger.warning(f'Required column for {required_col}: "{gw.columns[required_col]}" not found, skipping worksheet {gw.wks.title}')
- missing = True
- return missing
-
-
-def should_process_sheet(c: Config, sheet_name):
- if len(c.worksheet_allow) and sheet_name not in c.worksheet_allow:
- # ALLOW rules exist AND sheet name not explicitly allowed
- return False
- if len(c.worksheet_block) and sheet_name in c.worksheet_block:
- # BLOCK rules exist AND sheet name is blocked
- return False
- return True
-
-
-def archive_url(c: Config, url: str, folder: str, debug_string: str, is_retry: bool):
- url = expand_url(url)
- c.set_folder(folder)
- storage = c.get_storage()
-
- # make a new driver so each spreadsheet row is idempotent
- c.recreate_webdriver()
-
- # order matters, first to succeed excludes remaining
- active_archivers = [
- TelethonArchiver(storage, c),
- TiktokArchiver(storage, c),
- TwitterApiArchiver(storage, c),
- InstagramArchiver(storage, c),
- YoutubeDLArchiver(storage, c),
- TelegramArchiver(storage, c),
- TwitterArchiver(storage, c),
- VkArchiver(storage, c),
- WaybackArchiver(storage, c)
- ]
-
- for archiver in active_archivers:
- logger.debug(f'Trying {archiver} on {debug_string}')
-
- try:
- result = archiver.download(url, check_if_exists=c.check_if_exists)
- except KeyboardInterrupt as e: raise e # so the higher level catch can catch it
- except Exception as e:
- result = False
- logger.error(f'Got unexpected error in {debug_string} with {archiver.name} for {url=}: {e}\n{traceback.format_exc()}')
-
- if result:
- success = result.status in ['success', 'already archived']
- result.status = f"{archiver.name}: {result.status}"
- if success:
- logger.success(f'{archiver.name} succeeded on {debug_string}, {url=}')
- break
- # only 1 retry possible for now
- if is_retry and Archiver.is_retry(result.status):
- result.status = Archiver.remove_retry(result.status)
- logger.warning(f'{archiver.name} did not succeed on {debug_string}, final status: {result.status}')
- return result
-
-
-def process_sheet(c: Config):
- sh = c.gsheets_client.open(c.sheet)
-
- # loop through worksheets to check
- for ii, wks in enumerate(sh.worksheets()):
- if not should_process_sheet(c, wks.title):
- logger.info(f'Ignoring worksheet "{wks.title}" due to allow/block configurations')
- continue
-
- logger.info(f'Opening worksheet {ii=}: {wks.title=} {c.header=}')
- gw = GWorksheet(wks, header_row=c.header, columns=c.column_names)
-
- if missing_required_columns(gw): continue
-
- # archives will default to being in a folder 'doc_name/worksheet_name'
- default_folder = os.path.join(slugify(c.sheet), slugify(wks.title))
- c.set_folder(default_folder)
- storage = c.get_storage()
-
- # loop through rows in worksheet
- for row in range(1 + c.header, gw.count_rows() + 1):
- url = gw.get_cell(row, 'url')
- original_status = gw.get_cell(row, 'status')
- status = gw.get_cell(row, 'status', fresh=original_status in ['', None] and url != '')
-
- is_retry = False
- if url == '' or status not in ['', None]:
- is_retry = Archiver.should_retry_from_status(status)
- if not is_retry: continue
-
- # All checks done - archival process starts here
- try:
- gw.set_cell(row, 'status', 'Archive in progress')
- result = archive_url(c, url, gw.get_cell_or_default(row, 'folder', default_folder, when_empty_use_default=True), f"{row=}", is_retry=is_retry)
- if result:
- update_sheet(gw, row, url, result)
- else:
- gw.set_cell(row, 'status', 'failed: no archiver')
- except KeyboardInterrupt:
- # catches keyboard interruptions to do a clean exit
- logger.warning(f"caught interrupt on {row=}, {url=}")
- gw.set_cell(row, 'status', '')
- c.destroy_webdriver()
- exit()
- except Exception as e:
- logger.error(f'Got unexpected error in row {row} for {url=}: {e}\n{traceback.format_exc()}')
- gw.set_cell(row, 'status', 'failed: unexpected error (see logs)')
- logger.success(f'Finished worksheet {wks.title}')
-
-
-@logger.catch
-def main():
- c = Config()
- c.parse()
- logger.info(f'Opening document {c.sheet} for header {c.header}')
- with tempfile.TemporaryDirectory(dir="./") as tmpdir:
- Storage.TMP_FOLDER = tmpdir
- process_sheet(c)
- c.destroy_webdriver()
-
-
-if __name__ == '__main__':
- main()
diff --git a/src/auto_archiver/auto_auto_archive.py b/src/auto_archiver/auto_auto_archive.py
index 8c794c5..660be4f 100644
--- a/src/auto_archiver/auto_auto_archive.py
+++ b/src/auto_archiver/auto_auto_archive.py
@@ -1,29 +1,34 @@
-import tempfile
-import auto_archive
-from loguru import logger
-from configs import Config
-from storages import Storage
+
+#TODO: refactor GDriveStorage before merging to main
+# is it possible to have something like this with the new pipeline?
-def main():
- c = Config()
- c.parse()
- logger.info(f'Opening document {c.sheet} to look for sheet names to archive')
-
- gc = c.gsheets_client
- sh = gc.open(c.sheet)
-
- wks = sh.get_worksheet(0)
- values = wks.get_all_values()
-
- with tempfile.TemporaryDirectory(dir="./") as tmpdir:
- Storage.TMP_FOLDER = tmpdir
- for i in range(11, len(values)):
- c.sheet = values[i][0]
- logger.info(f"Processing {c.sheet}")
- auto_archive.process_sheet(c)
- c.destroy_webdriver()
+# # import tempfile
+# import auto_archive
+# from loguru import logger
+# from configs import Config
+# from storages import Storage
-if __name__ == "__main__":
- main()
+# def main():
+# c = Config()
+# c.parse()
+# logger.info(f'Opening document {c.sheet} to look for sheet names to archive')
+
+# gc = c.gsheets_client
+# sh = gc.open(c.sheet)
+
+# wks = sh.get_worksheet(0)
+# values = wks.get_all_values()
+
+# with tempfile.TemporaryDirectory(dir="./") as tmpdir:
+# Storage.TMP_FOLDER = tmpdir
+# for i in range(11, len(values)):
+# c.sheet = values[i][0]
+# logger.info(f"Processing {c.sheet}")
+# auto_archive.process_sheet(c)
+# c.destroy_webdriver()
+
+
+# if __name__ == "__main__":
+# main()
diff --git a/src/auto_archiver/core/__init__.py b/src/auto_archiver/core/__init__.py
index a171478..04f381e 100644
--- a/src/auto_archiver/core/__init__.py
+++ b/src/auto_archiver/core/__init__.py
@@ -4,4 +4,4 @@ from .step import Step
# cannot import ArchivingOrchestrator/Config to avoid circular dep
# from .orchestrator import ArchivingOrchestrator
-# from .v2config import ConfigV2
\ No newline at end of file
+# from .config import Config
\ No newline at end of file
diff --git a/src/auto_archiver/core/v2config.py b/src/auto_archiver/core/config.py
similarity index 93%
rename from src/auto_archiver/core/v2config.py
rename to src/auto_archiver/core/config.py
index 3db6618..73bd585 100644
--- a/src/auto_archiver/core/v2config.py
+++ b/src/auto_archiver/core/config.py
@@ -5,31 +5,31 @@ from dataclasses import dataclass, field
from typing import List
from collections import defaultdict
-from ..archivers import Archiverv2
+from ..archivers import Archiver
from ..feeders import Feeder
from ..databases import Database
from ..formatters import Formatter
-from ..storages import StorageV2
+from ..storages import Storage
from . import Step
from ..enrichers import Enricher
@dataclass
-class ConfigV2:
+class Config:
# TODO: should Config inherit from Step so it can have it's own configurations?
# these are only detected if they are put to the respective __init__.py
configurable_parents = [
Feeder,
Enricher,
- Archiverv2,
+ Archiver,
Database,
- StorageV2,
+ Storage,
Formatter
# Util
]
feeder: Step # TODO:= BaseFeeder
formatter: Formatter
- archivers: List[Archiverv2] = field(default_factory=[]) # TODO: fix type
+ archivers: List[Archiver] = field(default_factory=[]) # TODO: fix type
enrichers: List[Enricher] = field(default_factory=[])
storages: List[Step] = field(default_factory=[]) # TODO: fix type
databases: List[Database] = field(default_factory=[])
@@ -107,9 +107,9 @@ class ConfigV2:
self.feeder = Feeder.init(steps.get("feeder", "cli_feeder"), self.config)
self.formatter = Formatter.init(steps.get("formatter", "html_formatter"), self.config)
self.enrichers = [Enricher.init(e, self.config) for e in steps.get("enrichers", [])]
- self.archivers = [Archiverv2.init(e, self.config) for e in (steps.get("archivers") or [])]
+ self.archivers = [Archiver.init(e, self.config) for e in (steps.get("archivers") or [])]
self.databases = [Database.init(e, self.config) for e in steps.get("databases", [])]
- self.storages = [StorageV2.init(e, self.config) for e in steps.get("storages", [])]
+ self.storages = [Storage.init(e, self.config) for e in steps.get("storages", [])]
print("feeder", self.feeder)
print("enrichers", [e for e in self.enrichers])
diff --git a/src/auto_archiver/core/metadata.py b/src/auto_archiver/core/metadata.py
index bb3079c..f09d7e0 100644
--- a/src/auto_archiver/core/metadata.py
+++ b/src/auto_archiver/core/metadata.py
@@ -4,7 +4,7 @@ from ast import List, Set
from typing import Any, Union, Dict
from dataclasses import dataclass, field
from dataclasses_json import dataclass_json
-import datetime, mimetypes
+import datetime
from urllib.parse import urlparse
from loguru import logger
from dateutil.parser import parse as parse_dt
@@ -17,7 +17,7 @@ class Metadata:
status: str = "no archiver"
_processed_at: datetime = datetime.datetime.utcnow().replace(tzinfo=datetime.timezone.utc)
metadata: Dict[str, Any] = field(default_factory=dict)
- tmp_keys: Set[str] = field(default_factory=set, repr=False) # keys that are not to be saved in DBs
+ tmp_keys: Set[str] = field(default_factory=set, repr=False, metadata={"exclude":True}) # keys that are not to be saved in DBs
media: List[Media] = field(default_factory=list)
final_media: Media = None # can be overwritten by formatters
rearchivable: bool = False
diff --git a/src/auto_archiver/core/orchestrator.py b/src/auto_archiver/core/orchestrator.py
index 19a35aa..bb7902e 100644
--- a/src/auto_archiver/core/orchestrator.py
+++ b/src/auto_archiver/core/orchestrator.py
@@ -3,10 +3,10 @@ from ast import List
from typing import Union, Dict
from dataclasses import dataclass
-from ..archivers import Archiverv2
+from ..archivers import Archiver
from ..feeders import Feeder
from ..formatters import Formatter
-from ..storages import StorageV2
+from ..storages import Storage
from ..enrichers import Enricher
from ..databases import Database
from .media import Media
@@ -59,9 +59,9 @@ class ArchivingOrchestrator:
self.feeder: Feeder = config.feeder
self.formatter: Formatter = config.formatter
self.enrichers = config.enrichers
- self.archivers: List[Archiverv2] = config.archivers
+ self.archivers: List[Archiver] = config.archivers
self.databases: List[Database] = config.databases
- self.storages: List[StorageV2] = config.storages
+ self.storages: List[Storage] = config.storages
for a in self.archivers: a.setup()
@@ -69,12 +69,12 @@ class ArchivingOrchestrator:
for item in self.feeder:
self.feed_item(item)
- def feed_item(self, item:Metadata) -> Metadata:
+ def feed_item(self, item: Metadata) -> Metadata:
print("ARCHIVING", item)
try:
with tempfile.TemporaryDirectory(dir="./") as tmp_dir:
item.set_tmp_dir(tmp_dir)
- result = self.archive(item)
+ return self.archive(item)
except KeyboardInterrupt:
# catches keyboard interruptions to do a clean exit
logger.warning(f"caught interrupt on {item=}")
@@ -84,8 +84,6 @@ class ArchivingOrchestrator:
logger.error(f'Got unexpected error on item {item}: {e}\n{traceback.format_exc()}')
for d in self.databases: d.failed(item)
- return result
-
# how does this handle the parameters like folder which can be different for each archiver?
# the storage needs to know where to archive!!
# solution: feeders have context: extra metadata that they can read or ignore,
@@ -154,7 +152,7 @@ class ArchivingOrchestrator:
for prop in m.properties.values():
if isinstance(prop, Media):
s.store(prop, result)
- if isinstance(prop, list) and len(prop)>0 and isinstance(prop[0], Media):
+ if isinstance(prop, list) and len(prop) > 0 and isinstance(prop[0], Media):
for prop_media in prop:
s.store(prop_media, result)
diff --git a/src/auto_archiver/core/step.py b/src/auto_archiver/core/step.py
index ae10869..4c04c94 100644
--- a/src/auto_archiver/core/step.py
+++ b/src/auto_archiver/core/step.py
@@ -28,7 +28,7 @@ class Step(ABC):
for sub in child.__subclasses__():
if sub.name == name:
return sub(config)
- raise ClassFoundException(f"Unable to initialize STEP with {name=}, check your configuration file/step names.")
+ raise ClassFoundException(f"Unable to initialize STEP with {name=}, check your configuration file/step names, and make sure you made the step discoverable by putting it into __init__.py")
def assert_valid_string(self, prop: str) -> None:
"""
diff --git a/src/auto_archiver/databases/__init__.py b/src/auto_archiver/databases/__init__.py
index 3a3e907..1e676ea 100644
--- a/src/auto_archiver/databases/__init__.py
+++ b/src/auto_archiver/databases/__init__.py
@@ -1,3 +1,4 @@
from .database import Database
from .gsheet_db import GsheetsDb
-from .console_db import ConsoleDb
\ No newline at end of file
+from .console_db import ConsoleDb
+from .csv_db import CSVDb
\ No newline at end of file
diff --git a/src/auto_archiver/databases/csv_db.py b/src/auto_archiver/databases/csv_db.py
new file mode 100644
index 0000000..72c804b
--- /dev/null
+++ b/src/auto_archiver/databases/csv_db.py
@@ -0,0 +1,33 @@
+import os
+from loguru import logger
+
+from . import Database
+from ..core import Metadata
+from csv import DictWriter
+from dataclasses import asdict
+
+class CSVDb(Database):
+ """
+ Outputs results to a CSV file
+ """
+ name = "csv_db"
+
+ def __init__(self, config: dict) -> None:
+ # without this STEP.__init__ is not called
+ super().__init__(config)
+ self.assert_valid_string("csv_file")
+
+ @staticmethod
+ def configs() -> dict:
+ return {
+ "csv_file": {"default": "db.csv", "help": "CSV file name"}
+ }
+
+ def done(self, item: Metadata) -> None:
+ """archival result ready - should be saved to DB"""
+ logger.success(f"DONE {item}")
+ is_empty = not os.path.isfile(self.csv_file) or os.path.getsize(self.csv_file) == 0
+ with open(self.csv_file, "a", encoding="utf-8") as outf:
+ writer = DictWriter(outf, fieldnames=asdict(Metadata()))
+ if is_empty: writer.writeheader()
+ writer.writerow(asdict(item))
diff --git a/src/auto_archiver/databases/database.py b/src/auto_archiver/databases/database.py
index 3f09c4a..133afaa 100644
--- a/src/auto_archiver/databases/database.py
+++ b/src/auto_archiver/databases/database.py
@@ -18,7 +18,6 @@ class Database(Step, ABC):
# only for typing...
return Step.init(name, config, Database)
- @abstractmethod
def started(self, item: Metadata) -> None:
"""signals the DB that the given item archival has started"""
pass
diff --git a/src/auto_archiver/enrichers/wayback_enricher.py b/src/auto_archiver/enrichers/wayback_enricher.py
index 945b148..187c3fe 100644
--- a/src/auto_archiver/enrichers/wayback_enricher.py
+++ b/src/auto_archiver/enrichers/wayback_enricher.py
@@ -2,10 +2,10 @@ from loguru import logger
import time, requests
from . import Enricher
-from ..archivers import Archiverv2
+from ..archivers import Archiver
from ..core import Metadata
-class WaybackArchiverEnricher(Enricher, Archiverv2):
+class WaybackArchiverEnricher(Enricher, Archiver):
"""
Submits the current URL to the webarchive and returns a job_id or completed archive
"""
diff --git a/src/auto_archiver/feeders/__init__.py b/src/auto_archiver/feeders/__init__.py
index b11cd50..f42f98f 100644
--- a/src/auto_archiver/feeders/__init__.py
+++ b/src/auto_archiver/feeders/__init__.py
@@ -1,2 +1,3 @@
from.feeder import Feeder
-from .gsheet_feeder import GsheetsFeeder
\ No newline at end of file
+from .gsheet_feeder import GsheetsFeeder
+from .cli_feeder import CLIFeeder
\ No newline at end of file
diff --git a/src/auto_archiver/feeders/cli_feeder.py b/src/auto_archiver/feeders/cli_feeder.py
new file mode 100644
index 0000000..6d52085
--- /dev/null
+++ b/src/auto_archiver/feeders/cli_feeder.py
@@ -0,0 +1,33 @@
+import gspread, os
+
+# from metadata import Metadata
+from loguru import logger
+
+# from . import Enricher
+from . import Feeder
+from ..core import Metadata
+
+
+class CLIFeeder(Feeder):
+ name = "cli_feeder"
+
+ def __init__(self, config: dict) -> None:
+ # without this STEP.__init__ is not called
+ super().__init__(config)
+ assert type(self.urls) == list and len(self.urls) > 0, "Please provide a CSV list of URL(s) to process, with --cli_feeder.urls='url1,url2,url3'"
+
+ @staticmethod
+ def configs() -> dict:
+ return {
+ "urls": {
+ "default": None,
+ "help": "URL(s) to archive, either a single URL or a list of urls, should not come from config.yaml",
+ "cli_set": lambda cli_val, cur_val: list(set(cli_val.split(",")))
+ },
+ }
+
+ def __iter__(self) -> Metadata:
+ for url in self.urls:
+ logger.debug(f"Processing {url}")
+ yield Metadata().set_url(url).set("folder", "cli", True)
+ logger.success(f"Processed {len(self.urls)} URL(s)")
diff --git a/src/auto_archiver/storages/__init__.py b/src/auto_archiver/storages/__init__.py
index 7094d25..250004a 100644
--- a/src/auto_archiver/storages/__init__.py
+++ b/src/auto_archiver/storages/__init__.py
@@ -1,9 +1,3 @@
-# we need to explicitly expose the available imports here
-from .base_storage import Storage
-# from .local_storage import LocalStorage, LocalConfig
-# from .s3_storage import S3Config, S3Storage
-# from .gd_storage import GDConfig, GDStorage
-
-from .storage import StorageV2
-from .s3 import S3StorageV2
-from .local import LocalStorageV2
\ No newline at end of file
+from .storage import Storage
+from .s3 import S3Storage
+from .local import LocalStorage
\ No newline at end of file
diff --git a/src/auto_archiver/storages/base_storage.py b/src/auto_archiver/storages/base_storage.py
deleted file mode 100644
index f147678..0000000
--- a/src/auto_archiver/storages/base_storage.py
+++ /dev/null
@@ -1,33 +0,0 @@
-import os, uuid
-from loguru import logger
-from abc import ABC, abstractmethod
-from pathlib import Path
-
-
-class Storage(ABC):
- TMP_FOLDER = "tmp/"
-
- @abstractmethod
- def __init__(self, config): pass
-
- @abstractmethod
- def get_cdn_url(self, key): pass
-
- @abstractmethod
- def exists(self, key): pass
-
- @abstractmethod
- def uploadf(self, file, key, **kwargs): pass
-
- def clean_key(self, key):
- # Some storages does not work well with trailing forward slashes and some keys come with that
- if key.startswith('/'):
- logger.debug(f'Found and fixed a leading "/" for {key=}')
- return key[1:]
- return key
-
-
- def upload(self, filename: str, key: str, **kwargs):
- logger.debug(f'[{self.__class__.__name__}] uploading file {filename} with key {key}')
- with open(filename, 'rb') as f:
- self.uploadf(f, key, **kwargs)
diff --git a/src/auto_archiver/storages/gd_storage.py b/src/auto_archiver/storages/gd_storage.py
index 3af77f1..09c8938 100644
--- a/src/auto_archiver/storages/gd_storage.py
+++ b/src/auto_archiver/storages/gd_storage.py
@@ -1,178 +1,181 @@
-import os, time
-from loguru import logger
-from .base_storage import Storage
-from dataclasses import dataclass
-from googleapiclient.discovery import build
-from googleapiclient.http import MediaFileUpload
-from google.oauth2 import service_account
+#TODO: refactor GDriveStorage before merging to main
+
+# import os, time
+
+# from loguru import logger
+# from .base_storage import Storage
+# from dataclasses import dataclass
+# from googleapiclient.discovery import build
+# from googleapiclient.http import MediaFileUpload
+# from google.oauth2 import service_account
-from google.oauth2.credentials import Credentials
-from google.auth.transport.requests import Request
+# from google.oauth2.credentials import Credentials
+# from google.auth.transport.requests import Request
-@dataclass
-class GDConfig:
- root_folder_id: str
- oauth_token_filename: str
- service_account: str = "service_account.json"
- folder: str = "default"
+# @dataclass
+# class GDConfig:
+# root_folder_id: str
+# oauth_token_filename: str
+# service_account: str = "service_account.json"
+# folder: str = "default"
-class GDStorage(Storage):
- def __init__(self, config: GDConfig):
- self.folder = config.folder
- self.root_folder_id = config.root_folder_id
+# class GDStorage(Storage):
+# def __init__(self, config: GDConfig):
+# self.folder = config.folder
+# self.root_folder_id = config.root_folder_id
- SCOPES=['https://www.googleapis.com/auth/drive']
+# SCOPES=['https://www.googleapis.com/auth/drive']
- token_file = config.oauth_token_filename
- if token_file is not None:
- """
- Tokens are refreshed after 1 hour
- however keep working for 7 days (tbc)
- so as long as the job doesn't last for 7 days
- then this method of refreshing only once per run will work
- see this link for details on the token
- https://davemateer.com/2022/04/28/google-drive-with-python#tokens
- """
- logger.debug(f'Using GD OAuth token {token_file}')
- creds = Credentials.from_authorized_user_file(token_file, SCOPES)
+# token_file = config.oauth_token_filename
+# if token_file is not None:
+# """
+# Tokens are refreshed after 1 hour
+# however keep working for 7 days (tbc)
+# so as long as the job doesn't last for 7 days
+# then this method of refreshing only once per run will work
+# see this link for details on the token
+# https://davemateer.com/2022/04/28/google-drive-with-python#tokens
+# """
+# logger.debug(f'Using GD OAuth token {token_file}')
+# creds = Credentials.from_authorized_user_file(token_file, SCOPES)
- if not creds or not creds.valid:
- if creds and creds.expired and creds.refresh_token:
- logger.debug('Requesting new GD OAuth token')
- creds.refresh(Request())
- else:
- raise Exception("Problem with creds - create the token again")
+# if not creds or not creds.valid:
+# if creds and creds.expired and creds.refresh_token:
+# logger.debug('Requesting new GD OAuth token')
+# creds.refresh(Request())
+# else:
+# raise Exception("Problem with creds - create the token again")
- # Save the credentials for the next run
- with open(token_file, 'w') as token:
- logger.debug('Saving new GD OAuth token')
- token.write(creds.to_json())
- else:
- logger.debug('GD OAuth Token valid')
- else:
- gd_service_account = config.service_account
- logger.debug(f'Using GD Service Account {gd_service_account}')
- creds = service_account.Credentials.from_service_account_file(gd_service_account, scopes=SCOPES)
+# # Save the credentials for the next run
+# with open(token_file, 'w') as token:
+# logger.debug('Saving new GD OAuth token')
+# token.write(creds.to_json())
+# else:
+# logger.debug('GD OAuth Token valid')
+# else:
+# gd_service_account = config.service_account
+# logger.debug(f'Using GD Service Account {gd_service_account}')
+# creds = service_account.Credentials.from_service_account_file(gd_service_account, scopes=SCOPES)
- self.service = build('drive', 'v3', credentials=creds)
+# self.service = build('drive', 'v3', credentials=creds)
- def get_cdn_url(self, key):
- """
- only support files saved in a folder for GD
- S3 supports folder and all stored in the root
- """
- key = self.clean_key(key)
+# def get_cdn_url(self, key):
+# """
+# only support files saved in a folder for GD
+# S3 supports folder and all stored in the root
+# """
+# key = self.clean_key(key)
- full_name = os.path.join(self.folder, key)
- parent_id, folder_id = self.root_folder_id, None
- path_parts = full_name.split(os.path.sep)
- filename = path_parts[-1]
- logger.info(f"looking for folders for {path_parts[0:-1]} before uploading {filename=}")
- for folder in path_parts[0:-1]:
- folder_id = self._get_id_from_parent_and_name(parent_id, folder, use_mime_type=True, raise_on_missing=True)
- parent_id = folder_id
+# full_name = os.path.join(self.folder, key)
+# parent_id, folder_id = self.root_folder_id, None
+# path_parts = full_name.split(os.path.sep)
+# filename = path_parts[-1]
+# logger.info(f"looking for folders for {path_parts[0:-1]} before uploading {filename=}")
+# for folder in path_parts[0:-1]:
+# folder_id = self._get_id_from_parent_and_name(parent_id, folder, use_mime_type=True, raise_on_missing=True)
+# parent_id = folder_id
- # get id of file inside folder (or sub folder)
- file_id = self._get_id_from_parent_and_name(folder_id, filename)
- return f"https://drive.google.com/file/d/{file_id}/view?usp=sharing"
+# # get id of file inside folder (or sub folder)
+# file_id = self._get_id_from_parent_and_name(folder_id, filename)
+# return f"https://drive.google.com/file/d/{file_id}/view?usp=sharing"
- def exists(self, key):
- try:
- self.get_cdn_url(key)
- return True
- except: return False
+# def exists(self, key):
+# try:
+# self.get_cdn_url(key)
+# return True
+# except: return False
- def uploadf(self, file: str, key: str, **_kwargs):
- """
- 1. for each sub-folder in the path check if exists or create
- 2. upload file to root_id/other_paths.../filename
- """
- key = self.clean_key(key)
+# def uploadf(self, file: str, key: str, **_kwargs):
+# """
+# 1. for each sub-folder in the path check if exists or create
+# 2. upload file to root_id/other_paths.../filename
+# """
+# key = self.clean_key(key)
- full_name = os.path.join(self.folder, key)
- parent_id, upload_to = self.root_folder_id, None
- path_parts = full_name.split(os.path.sep)
- filename = path_parts[-1]
- logger.info(f"checking folders {path_parts[0:-1]} exist (or creating) before uploading {filename=}")
- for folder in path_parts[0:-1]:
- upload_to = self._get_id_from_parent_and_name(parent_id, folder, use_mime_type=True, raise_on_missing=False)
- if upload_to is None:
- upload_to = self._mkdir(folder, parent_id)
- parent_id = upload_to
+# full_name = os.path.join(self.folder, key)
+# parent_id, upload_to = self.root_folder_id, None
+# path_parts = full_name.split(os.path.sep)
+# filename = path_parts[-1]
+# logger.info(f"checking folders {path_parts[0:-1]} exist (or creating) before uploading {filename=}")
+# for folder in path_parts[0:-1]:
+# upload_to = self._get_id_from_parent_and_name(parent_id, folder, use_mime_type=True, raise_on_missing=False)
+# if upload_to is None:
+# upload_to = self._mkdir(folder, parent_id)
+# parent_id = upload_to
- # upload file to gd
- logger.debug(f'uploading {filename=} to folder id {upload_to}')
- file_metadata = {
- 'name': [filename],
- 'parents': [upload_to]
- }
- media = MediaFileUpload(file, resumable=True)
- gd_file = self.service.files().create(body=file_metadata, media_body=media, fields='id').execute()
- logger.debug(f'uploadf: uploaded file {gd_file["id"]} succesfully in folder={upload_to}')
+# # upload file to gd
+# logger.debug(f'uploading {filename=} to folder id {upload_to}')
+# file_metadata = {
+# 'name': [filename],
+# 'parents': [upload_to]
+# }
+# media = MediaFileUpload(file, resumable=True)
+# gd_file = self.service.files().create(body=file_metadata, media_body=media, fields='id').execute()
+# logger.debug(f'uploadf: uploaded file {gd_file["id"]} succesfully in folder={upload_to}')
- def upload(self, filename: str, key: str, **kwargs):
- # GD only requires the filename not a file reader
- self.uploadf(filename, key, **kwargs)
+# def upload(self, filename: str, key: str, **kwargs):
+# # GD only requires the filename not a file reader
+# self.uploadf(filename, key, **kwargs)
- # gets the Drive folderID if it is there
- def _get_id_from_parent_and_name(self, parent_id: str, name: str, retries: int = 1, sleep_seconds: int = 10, use_mime_type: bool = False, raise_on_missing: bool = True, use_cache=False):
- """
- Retrieves the id of a folder or file from its @name and the @parent_id folder
- Optionally does multiple @retries and sleeps @sleep_seconds between them
- If @use_mime_type will restrict search to "mimeType='application/vnd.google-apps.folder'"
- If @raise_on_missing will throw error when not found, or returns None
- Will remember previous calls to avoid duplication if @use_cache - might not have all edge cases tested, so use at own risk
- Returns the id of the file or folder from its name as a string
- """
- # cache logic
- if use_cache:
- self.api_cache = getattr(self, "api_cache", {})
- cache_key = f"{parent_id}_{name}_{use_mime_type}"
- if cache_key in self.api_cache:
- logger.debug(f"cache hit for {cache_key=}")
- return self.api_cache[cache_key]
+# # gets the Drive folderID if it is there
+# def _get_id_from_parent_and_name(self, parent_id: str, name: str, retries: int = 1, sleep_seconds: int = 10, use_mime_type: bool = False, raise_on_missing: bool = True, use_cache=False):
+# """
+# Retrieves the id of a folder or file from its @name and the @parent_id folder
+# Optionally does multiple @retries and sleeps @sleep_seconds between them
+# If @use_mime_type will restrict search to "mimeType='application/vnd.google-apps.folder'"
+# If @raise_on_missing will throw error when not found, or returns None
+# Will remember previous calls to avoid duplication if @use_cache - might not have all edge cases tested, so use at own risk
+# Returns the id of the file or folder from its name as a string
+# """
+# # cache logic
+# if use_cache:
+# self.api_cache = getattr(self, "api_cache", {})
+# cache_key = f"{parent_id}_{name}_{use_mime_type}"
+# if cache_key in self.api_cache:
+# logger.debug(f"cache hit for {cache_key=}")
+# return self.api_cache[cache_key]
- # API logic
- debug_header: str = f"[searching {name=} in {parent_id=}]"
- query_string = f"'{parent_id}' in parents and name = '{name}' and trashed = false "
- if use_mime_type:
- query_string += f" and mimeType='application/vnd.google-apps.folder' "
+# # API logic
+# debug_header: str = f"[searching {name=} in {parent_id=}]"
+# query_string = f"'{parent_id}' in parents and name = '{name}' and trashed = false "
+# if use_mime_type:
+# query_string += f" and mimeType='application/vnd.google-apps.folder' "
- for attempt in range(retries):
- results = self.service.files().list(
- q=query_string,
- spaces='drive', # ie not appDataFolder or photos
- fields='files(id, name)'
- ).execute()
- items = results.get('files', [])
+# for attempt in range(retries):
+# results = self.service.files().list(
+# q=query_string,
+# spaces='drive', # ie not appDataFolder or photos
+# fields='files(id, name)'
+# ).execute()
+# items = results.get('files', [])
- if len(items) > 0:
- logger.debug(f"{debug_header} found {len(items)} matches, returning last of {','.join([i['id'] for i in items])}")
- _id = items[-1]['id']
- if use_cache: self.api_cache[cache_key] = _id
- return _id
- else:
- logger.debug(f'{debug_header} not found, attempt {attempt+1}/{retries}.')
- if attempt < retries - 1:
- logger.debug(f'sleeping for {sleep_seconds} second(s)')
- time.sleep(sleep_seconds)
+# if len(items) > 0:
+# logger.debug(f"{debug_header} found {len(items)} matches, returning last of {','.join([i['id'] for i in items])}")
+# _id = items[-1]['id']
+# if use_cache: self.api_cache[cache_key] = _id
+# return _id
+# else:
+# logger.debug(f'{debug_header} not found, attempt {attempt+1}/{retries}.')
+# if attempt < retries - 1:
+# logger.debug(f'sleeping for {sleep_seconds} second(s)')
+# time.sleep(sleep_seconds)
- if raise_on_missing:
- raise ValueError(f'{debug_header} not found after {retries} attempt(s)')
- return None
+# if raise_on_missing:
+# raise ValueError(f'{debug_header} not found after {retries} attempt(s)')
+# return None
- def _mkdir(self, name: str, parent_id: str):
- """
- Creates a new GDrive folder @name inside folder @parent_id
- Returns id of the created folder
- """
- logger.debug(f'Creating new folder with {name=} inside {parent_id=}')
- file_metadata = {
- 'name': [name],
- 'mimeType': 'application/vnd.google-apps.folder',
- 'parents': [parent_id]
- }
- gd_folder = self.service.files().create(body=file_metadata, fields='id').execute()
- return gd_folder.get('id')
+# def _mkdir(self, name: str, parent_id: str):
+# """
+# Creates a new GDrive folder @name inside folder @parent_id
+# Returns id of the created folder
+# """
+# logger.debug(f'Creating new folder with {name=} inside {parent_id=}')
+# file_metadata = {
+# 'name': [name],
+# 'mimeType': 'application/vnd.google-apps.folder',
+# 'parents': [parent_id]
+# }
+# gd_folder = self.service.files().create(body=file_metadata, fields='id').execute()
+# return gd_folder.get('id')
diff --git a/src/auto_archiver/storages/local.py b/src/auto_archiver/storages/local.py
index 1d44e6f..f5768a9 100644
--- a/src/auto_archiver/storages/local.py
+++ b/src/auto_archiver/storages/local.py
@@ -8,10 +8,10 @@ from slugify import slugify
from ..core import Metadata
from ..core import Media
-from ..storages import StorageV2
+from ..storages import Storage
-class LocalStorageV2(StorageV2):
+class LocalStorage(Storage):
name = "local_storage"
def __init__(self, config: dict) -> None:
diff --git a/src/auto_archiver/storages/s3.py b/src/auto_archiver/storages/s3.py
index 25eb324..a88e2b2 100644
--- a/src/auto_archiver/storages/s3.py
+++ b/src/auto_archiver/storages/s3.py
@@ -4,12 +4,12 @@ import boto3, uuid, os, mimetypes
from botocore.errorfactory import ClientError
from ..core import Metadata
from ..core import Media
-from ..storages import StorageV2
+from ..storages import Storage
from loguru import logger
from slugify import slugify
-class S3StorageV2(StorageV2):
+class S3Storage(Storage):
name = "s3_storage"
def __init__(self, config: dict) -> None:
diff --git a/src/auto_archiver/storages/s3_storage.py b/src/auto_archiver/storages/s3_storage.py
deleted file mode 100644
index 563d2ea..0000000
--- a/src/auto_archiver/storages/s3_storage.py
+++ /dev/null
@@ -1,80 +0,0 @@
-import uuid, os, mimetypes
-from dataclasses import dataclass
-
-import boto3
-from botocore.errorfactory import ClientError
-
-from .base_storage import Storage
-from dataclasses import dataclass
-from loguru import logger
-
-
-@dataclass
-class S3Config:
- bucket: str
- region: str
- key: str
- secret: str
- folder: str = ""
- endpoint_url: str = "https://{region}.digitaloceanspaces.com"
- cdn_url: str = "https://{bucket}.{region}.cdn.digitaloceanspaces.com/{key}"
- private: bool = False
- key_path: str = "default" # 'default' uses full naming, 'random' uses generated uuid
-
-
-class S3Storage(Storage):
-
- def __init__(self, config: S3Config):
- self.bucket = config.bucket
- self.region = config.region
- self.folder = config.folder
- self.private = config.private
- self.cdn_url = config.cdn_url
- self.key_path = config.key_path
- self.key_dict = {}
-
- self.s3 = boto3.client(
- 's3',
- region_name=config.region,
- endpoint_url=config.endpoint_url.format(region=config.region),
- aws_access_key_id=config.key,
- aws_secret_access_key=config.secret
- )
-
- def _get_path(self, key):
- """
- Depends on the self.key_path configuration:
- * random - assigns a random UUID which can be used in conjunction with "private=false" to have unguessable documents publicly available -> self.folder/randomUUID
- * default -> defaults to self.folder/key
- """
- # defaults to /key
- final_key = key
- if self.key_path == "random":
- if key not in self.key_dict:
- ext = os.path.splitext(key)[1]
- self.key_dict[key] = f"{str(uuid.uuid4())}{ext}"
- final_key = self.key_dict[key]
- return os.path.join(self.folder, final_key)
-
- def get_cdn_url(self, key):
- return self.cdn_url.format(bucket=self.bucket, region=self.region, key=self._get_path(key))
-
- def exists(self, key):
- try:
- self.s3.head_object(Bucket=self.bucket, Key=self._get_path(key))
- return True
- except ClientError:
- return False
-
- def uploadf(self, file, key, **kwargs):
- extra_args = kwargs.get("extra_args", {})
- if not self.private and 'ACL' not in extra_args:
- extra_args['ACL'] = 'public-read'
-
- if 'ContentType' not in extra_args:
- try:
- extra_args['ContentType'] = mimetypes.guess_type(key)[0]
- except Exception as e:
- logger.error(f"Unable to get mimetype for {key=}, error: {e}")
-
- self.s3.upload_fileobj(file, Bucket=self.bucket, Key=self._get_path(key), ExtraArgs=extra_args)
diff --git a/src/auto_archiver/storages/storage.py b/src/auto_archiver/storages/storage.py
index 9d6a005..3e4134a 100644
--- a/src/auto_archiver/storages/storage.py
+++ b/src/auto_archiver/storages/storage.py
@@ -10,7 +10,7 @@ from slugify import slugify
@dataclass
-class StorageV2(Step):
+class Storage(Step):
name = "storage"
def __init__(self, config: dict) -> None:
@@ -18,8 +18,8 @@ class StorageV2(Step):
super().__init__(config)
# only for typing...
- def init(name: str, config: dict) -> StorageV2:
- return Step.init(name, config, StorageV2)
+ def init(name: str, config: dict) -> Storage:
+ return Step.init(name, config, Storage)
def store(self, media: Media, item: Metadata) -> None:
self.set_key(media, item)