Reduce S3 duplication while keeping random URLs via hash (#112)

This commit is contained in:
Miguel Sozinho Ramalho
2023-12-12 19:12:03 +00:00
committed by GitHub
parent 9ee323a654
commit 3e56ef137d
9 changed files with 355 additions and 294 deletions

View File

@@ -1,9 +1,11 @@
import json, os, traceback, uuid
import json, os, traceback
import tiktok_downloader
from loguru import logger
from . import Archiver
from ..core import Metadata, Media, ArchivingContext
from ..utils.misc import random_str
class TiktokArchiver(Archiver):
@@ -37,7 +39,7 @@ class TiktokArchiver(Archiver):
logger.warning(f'Other Tiktok error {error}')
try:
filename = os.path.join(ArchivingContext.get_tmp_dir(), f'{str(uuid.uuid4())[0:8]}.mp4')
filename = os.path.join(ArchivingContext.get_tmp_dir(), f'{random_str(8)}.mp4')
tiktok_media = tiktok_downloader.snaptik(url).get_media()
if len(tiktok_media) <= 0:

View File

@@ -1,9 +1,10 @@
from loguru import logger
import time, uuid, os
import time, os
from selenium.common.exceptions import TimeoutException
from . import Enricher
from ..utils import Webdriver, UrlUtil
from ..utils import Webdriver, UrlUtil, random_str
from ..core import Media, Metadata, ArchivingContext
class ScreenshotEnricher(Enricher):
@@ -29,7 +30,7 @@ class ScreenshotEnricher(Enricher):
try:
driver.get(url)
time.sleep(int(self.sleep_before_screenshot))
screenshot_file = os.path.join(ArchivingContext.get_tmp_dir(), f"screenshot_{str(uuid.uuid4())[0:8]}.png")
screenshot_file = os.path.join(ArchivingContext.get_tmp_dir(), f"screenshot_{random_str(8)}.png")
driver.save_screenshot(screenshot_file)
to_enrich.add_media(Media(filename=screenshot_file), id="screenshot")
except TimeoutException:

View File

@@ -1,8 +1,9 @@
import ffmpeg, os, uuid
import ffmpeg, os
from loguru import logger
from . import Enricher
from ..core import Media, Metadata, ArchivingContext
from ..utils.misc import random_str
class ThumbnailEnricher(Enricher):
@@ -23,7 +24,7 @@ class ThumbnailEnricher(Enricher):
logger.debug(f"generating thumbnails")
for i, m in enumerate(to_enrich.media[::]):
if m.is_video():
folder = os.path.join(ArchivingContext.get_tmp_dir(), str(uuid.uuid4()))
folder = os.path.join(ArchivingContext.get_tmp_dir(), random_str(24))
os.makedirs(folder, exist_ok=True)
logger.debug(f"generating thumbnails for {m.filename}")
fps, duration = 0.5, m.get("duration")

View File

@@ -1,6 +1,6 @@
import jsonlines
import mimetypes
import os, shutil, subprocess, uuid
import os, shutil, subprocess
from zipfile import ZipFile
from loguru import logger
from warcio.archiveiterator import ArchiveIterator
@@ -8,7 +8,7 @@ from warcio.archiveiterator import ArchiveIterator
from ..core import Media, Metadata, ArchivingContext
from . import Enricher
from ..archivers import Archiver
from ..utils import UrlUtil
from ..utils import UrlUtil, random_str
class WaczArchiverEnricher(Enricher, Archiver):
@@ -47,7 +47,7 @@ class WaczArchiverEnricher(Enricher, Archiver):
url = to_enrich.get_url()
collection = str(uuid.uuid4())[0:8]
collection = random_str(8)
browsertrix_home_host = os.environ.get('BROWSERTRIX_HOME_HOST') or os.path.abspath(ArchivingContext.get_tmp_dir())
browsertrix_home_container = os.environ.get('BROWSERTRIX_HOME_CONTAINER') or browsertrix_home_host

View File

@@ -1,6 +1,6 @@
from __future__ import annotations
from dataclasses import dataclass
import mimetypes, uuid, os, pathlib
import mimetypes, os, pathlib
from jinja2 import Environment, FileSystemLoader
from urllib.parse import quote
from loguru import logger
@@ -9,6 +9,7 @@ from ..version import __version__
from ..core import Metadata, Media, ArchivingContext
from . import Formatter
from ..enrichers import HashEnricher
from ..utils.misc import random_str
@dataclass
@@ -44,7 +45,7 @@ class HtmlFormatter(Formatter):
metadata=item.metadata,
version=__version__
)
html_path = os.path.join(ArchivingContext.get_tmp_dir(), f"formatted{str(uuid.uuid4())}.html")
html_path = os.path.join(ArchivingContext.get_tmp_dir(), f"formatted{random_str(24)}.html")
with open(html_path, mode="w", encoding="utf-8") as outf:
outf.write(content)
final_media = Media(filename=html_path, _mimetype="text/html")

View File

@@ -1,14 +1,14 @@
from typing import IO, Any
import boto3, uuid, os, mimetypes
from botocore.errorfactory import ClientError
from ..core import Metadata
from typing import IO
import boto3, os
from ..utils.misc import random_str
from ..core import Media
from ..storages import Storage
from ..enrichers import HashEnricher
from loguru import logger
from slugify import slugify
NO_DUPLICATES_FOLDER = "no-dups/"
class S3Storage(Storage):
name = "s3_storage"
@@ -21,6 +21,9 @@ class S3Storage(Storage):
aws_access_key_id=self.key,
aws_secret_access_key=self.secret
)
self.random_no_duplicate = bool(self.random_no_duplicate)
if self.random_no_duplicate:
logger.warning("random_no_duplicate is set to True, this will override `path_generator`, `filename_generator` and `folder`.")
@staticmethod
def configs() -> dict:
@@ -31,7 +34,7 @@ class S3Storage(Storage):
"region": {"default": None, "help": "S3 region name"},
"key": {"default": None, "help": "S3 API key"},
"secret": {"default": None, "help": "S3 API secret"},
# TODO: how to have sth like a custom folder? has to come from the feeders
"random_no_duplicate": {"default": False, "help": f"if set, it will override `path_generator`, `filename_generator` and `folder`. It will check if the file already exists and if so it will not upload it again. Creates a new root folder path `{NO_DUPLICATES_FOLDER}`"},
"endpoint_url": {
"default": 'https://{region}.digitaloceanspaces.com',
"help": "S3 bucket endpoint, {region} are inserted at runtime"
@@ -47,6 +50,22 @@ class S3Storage(Storage):
return self.cdn_url.format(bucket=self.bucket, region=self.region, key=media.key)
def uploadf(self, file: IO[bytes], media: Media, **kwargs: dict) -> None:
if not self.is_upload_needed(media): return True
if self.random_no_duplicate:
# checks if a folder with the hash already exists, if so it skips the upload
he = HashEnricher({"hash_enricher": {"algorithm": "SHA-256", "chunksize": 1.6e7}})
hd = he.calculate_hash(media.filename)
path = os.path.join(NO_DUPLICATES_FOLDER, hd[:24])
if existing_key:=self.file_in_folder(path):
media.key = existing_key
logger.debug(f"skipping upload of {media.filename} because it already exists in {media.key}")
return True
_, ext = os.path.splitext(media.key)
media.key = os.path.join(path, f"{random_str(24)}{ext}")
extra_args = kwargs.get("extra_args", {})
if not self.private and 'ACL' not in extra_args:
extra_args['ACL'] = 'public-read'
@@ -60,14 +79,30 @@ class S3Storage(Storage):
self.s3.upload_fileobj(file, Bucket=self.bucket, Key=media.key, ExtraArgs=extra_args)
return True
def is_upload_needed(self, media: Media) -> bool:
    """Decide whether ``media`` still has to be uploaded, settling its key on the way.

    When ``self.random_no_duplicate`` is falsy nothing is checked: the method
    returns True and ``media.key`` is left untouched.

    Otherwise the SHA-256 hash of the file at ``media.filename`` is computed and
    its first 24 characters name a folder under ``NO_DUPLICATES_FOLDER``:

    * if that folder already holds an object, ``media.key`` is pointed at the
      existing object and False is returned (no upload needed);
    * if not, ``media.key`` is replaced by a fresh random name inside that
      folder, keeping the extension of the previous key, and True is returned.

    Side effect: in ``random_no_duplicate`` mode ``media.key`` is always
    overwritten, whatever the return value.
    """
    # S3 object keys always use "/" as separator; os.path.join would insert
    # the host separator ("\\" on Windows) and produce malformed keys.
    import posixpath

    if not self.random_no_duplicate:
        return True

    # checks if a folder with the hash already exists, if so it skips the upload
    he = HashEnricher({"hash_enricher": {"algorithm": "SHA-256", "chunksize": 1.6e7}})
    hd = he.calculate_hash(media.filename)
    path = posixpath.join(NO_DUPLICATES_FOLDER, hd[:24])

    if existing_key := self.file_in_folder(path):
        media.key = existing_key
        logger.debug(f"skipping upload of {media.filename} because it already exists in {media.key}")
        return False

    # identical content was never stored: keep the extension, randomise the name
    _, ext = posixpath.splitext(media.key)
    media.key = posixpath.join(path, f"{random_str(24)}{ext}")
    return True
def file_in_folder(self, path: str) -> "str | bool":
    """Return the key of one object stored directly under ``path``, or False.

    ``path`` is treated as a folder: a trailing ``/`` is appended when missing
    and used as the listing prefix, with ``/`` as delimiter and at most one
    result requested from ``self.s3.list_objects`` on ``self.bucket``.

    Returns:
        The ``Key`` of the first listed object when the response carries a
        non-empty ``Contents`` list, otherwise ``False`` (callers rely on the
        falsy value, e.g. in ``if existing_key := self.file_in_folder(...)``).
    """
    # checks if path exists and is not an empty folder
    prefix = path if path.endswith('/') else path + '/'
    resp = self.s3.list_objects(Bucket=self.bucket, Prefix=prefix, Delimiter='/', MaxKeys=1)
    # a missing *or empty* 'Contents' both mean "nothing there"; indexing an
    # empty list would otherwise raise IndexError
    contents = resp.get('Contents')
    if contents:
        return contents[0]['Key']
    return False
# def exists(self, key: str) -> bool:
# """
# Tests if a given file with key=key exists in the bucket
# """
# try:
# self.s3.head_object(Bucket=self.bucket, Key=key)
# return True
# except ClientError as e:
# logger.warning(f"got a ClientError when checking if {key=} exists in bucket={self.bucket}: {e}")
# return False

View File

@@ -2,11 +2,13 @@ from __future__ import annotations
from abc import abstractmethod
from dataclasses import dataclass
from typing import IO
import os
from ..utils.misc import random_str
from ..core import Media, Step, ArchivingContext
from ..enrichers import HashEnricher
from loguru import logger
import os, uuid
from slugify import slugify
@@ -72,10 +74,10 @@ class Storage(Step):
filename = slugify(filename) # in case it comes with os.sep
elif self.path_generator == "url": path = slugify(url)
elif self.path_generator == "random":
path = ArchivingContext.get("random_path", str(uuid.uuid4())[:16], True)
path = ArchivingContext.get("random_path", random_str(24), True)
# filename_generator logic
if self.filename_generator == "random": filename = str(uuid.uuid4())[:16]
if self.filename_generator == "random": filename = random_str(24)
elif self.filename_generator == "static":
he = HashEnricher({"hash_enricher": {"algorithm": ArchivingContext.get("hash_enricher.algorithm"), "chunksize": 1.6e7}})
hd = he.calculate_hash(media.filename)

View File

@@ -1,5 +1,6 @@
import os, json, requests
import uuid
from datetime import datetime
from loguru import logger
@@ -49,3 +50,7 @@ def update_nested_dict(dictionary, update_dict):
update_nested_dict(dictionary[key], value)
else:
dictionary[key] = value
def random_str(length: int = 32) -> str:
    """Return a random lowercase hexadecimal string of ``length`` characters.

    The characters are the leading hex digits of a freshly generated UUID4
    with the dashes removed, which is why no more than 32 can be produced.

    Args:
        length: number of characters wanted, from 0 to 32 inclusive.

    Raises:
        ValueError: if ``length`` is negative or greater than 32.
    """
    # explicit check instead of `assert`: asserts vanish under `python -O`, and
    # a negative length would otherwise silently slice to the wrong size
    if not 0 <= length <= 32:
        raise ValueError("length must be between 0 and 32 as UUID4 is used")
    return uuid.uuid4().hex[:length]