Compare commits

..

30 Commits

Author SHA1 Message Date
msramalho
345e03e916 enables option to toggle db api writes 2023-12-13 12:54:12 +00:00
msramalho
0a3053bbc7 version update 2023-12-13 11:29:13 +00:00
Miguel Sozinho Ramalho
e69660be82 chooses most complete result from api (#117) 2023-12-13 11:28:27 +00:00
Miguel Sozinho Ramalho
a786d4bb0e chooses most complete result from api (#116) 2023-12-13 11:26:46 +00:00
Miguel Sozinho Ramalho
128d4136e3 fixes empty api search results (#115) 2023-12-13 10:51:25 +00:00
Miguel Sozinho Ramalho
98fb574d89 fixing older db entries formats (#114) 2023-12-12 22:47:54 +00:00
Miguel Sozinho Ramalho
6f36e92e02 enables api_db cache queries if configured with new option (#113) 2023-12-12 19:20:26 +00:00
Miguel Sozinho Ramalho
3e56ef137d reduce s3 duplicating while keeping random urls via hash (#112) 2023-12-12 19:12:03 +00:00
Jett Chen
9ee323a654 Set _mimetype for final media of html formatter (#111) 2023-12-11 11:47:04 +00:00
Kai
9eb39943c7 Extract text in wacz_enricher (#110) 2023-12-05 22:24:12 +00:00
msramalho
8624e9f177 version update 0.7.1 2023-11-13 11:58:43 +01:00
Galen Reich
381940f5a8 Fix Selenium headless invokation (#106)
Co-authored-by: msramalho <19508417+msramalho@users.noreply.github.com>
2023-11-13 11:56:35 +01:00
msramalho
1382f8b795 version bump and release without commit 2023-09-22 10:18:58 +01:00
Dave Mateer
fac8364762 Updated gd.py to work with shared folders (#102)
Co-authored-by: msramalho <19508417+msramalho@users.noreply.github.com>
2023-09-22 10:17:54 +01:00
msramalho
0feeb0bd24 Bump version to v0.6.12 for release 2023-09-20 10:18:44 +01:00
msramalho
ddb9dc87d7 unfortunately needed twitter->x 2023-09-20 10:17:31 +01:00
msramalho
e8935b9a80 Bump version to v0.6.11 for release 2023-09-15 19:53:07 +01:00
msramalho
b157f9a6b1 renaming variable 2023-09-15 19:52:47 +01:00
msramalho
ea38a604bb fixes #96 by not assigning to self.prop 2023-09-15 19:35:35 +01:00
msramalho
53494c961e Bump version to v0.6.10 for release 2023-09-14 17:50:08 +01:00
Kai
f7839a99cc Add configs for path to write and read wacz archives (#93)
Co-authored-by: msramalho <19508417+msramalho@users.noreply.github.com>
2023-09-14 17:49:37 +01:00
msramalho
7a2119e6e9 Bump version to v0.6.9 for release 2023-09-12 20:08:00 +01:00
Miguel Sozinho Ramalho
3ae25e51e7 adds flexibile setup for wacz in docker (#94) 2023-09-12 20:07:21 +01:00
msramalho
9584193d69 Bump version to v0.6.8 for release 2023-09-08 15:10:02 +01:00
msramalho
0dd45d90f1 fix: docker+wacz troubles 2023-09-08 15:09:50 +01:00
msramalho
edcb2da74a Bump version to v0.6.7 for release 2023-09-06 17:07:14 +01:00
msramalho
17d9bf694f fix docker image so as not to remove browsertrix files 2023-09-06 17:07:10 +01:00
Miguel Sozinho Ramalho
368395ffa8 Merge pull request #88 from djhmateer/v6-test 2023-08-28 11:09:28 +01:00
Miguel Sozinho Ramalho
21d7d2e16c format youtubedl_archiver.py 2023-08-28 11:09:03 +01:00
Dave Mateer
0bbb4c9b08 Added noplaylist true to youtubedl so that videos in playlists will work 2023-08-27 17:26:36 +01:00
26 changed files with 1118 additions and 962 deletions

View File

@@ -4,7 +4,6 @@ ENV RUNNING_IN_DOCKER=1
WORKDIR /app WORKDIR /app
# TODO: use custom ffmpeg builds instead of apt-get install
RUN pip install --upgrade pip && \ RUN pip install --upgrade pip && \
pip install pipenv && \ pip install pipenv && \
add-apt-repository ppa:mozillateam/ppa && \ add-apt-repository ppa:mozillateam/ppa && \
@@ -18,7 +17,6 @@ RUN pip install --upgrade pip && \
rm geckodriver-v* rm geckodriver-v*
# TODO: avoid copying unnecessary files, including .git
COPY Pipfile* ./ COPY Pipfile* ./
# install from pipenv, with browsertrix-only requirements # install from pipenv, with browsertrix-only requirements
RUN pipenv install && \ RUN pipenv install && \
@@ -27,12 +25,7 @@ RUN pipenv install && \
# doing this at the end helps during development, builds are quick # doing this at the end helps during development, builds are quick
COPY ./src/ . COPY ./src/ .
# TODO: figure out how to make volumes not be root, does it depend on host or dockerfile?
# RUN useradd --system --groups sudo --shell /bin/bash archiver && chown -R archiver:sudo .
# USER archiver
ENTRYPOINT ["pipenv", "run", "python3", "-m", "auto_archiver"] ENTRYPOINT ["pipenv", "run", "python3", "-m", "auto_archiver"]
# should be executed with 2 volumes (3 if local_storage is used) # should be executed with 2 volumes (3 if local_storage is used)
# docker run --rm -v $PWD/secrets:/app/secrets -v $PWD/local_archive:/app/local_archive aa pipenv run python3 -m auto_archiver --config secrets/orchestration.yaml # docker run --rm -v $PWD/secrets:/app/secrets -v $PWD/local_archive:/app/local_archive aa pipenv run python3 -m auto_archiver --config secrets/orchestration.yaml

View File

@@ -35,6 +35,7 @@ vk-url-scraper = "*"
requests = {extras = ["socks"], version = "*"} requests = {extras = ["socks"], version = "*"}
numpy = "*" numpy = "*"
warcio = "*" warcio = "*"
jsonlines = "*"
[dev-packages] [dev-packages]
autopep8 = "*" autopep8 = "*"

1752
Pipfile.lock generated

File diff suppressed because it is too large Load Diff

View File

@@ -8,8 +8,8 @@ TAG=$(python -c 'from src.auto_archiver.version import __version__; print("v" +
read -p "Creating new release for $TAG. Do you want to continue? [Y/n] " prompt read -p "Creating new release for $TAG. Do you want to continue? [Y/n] " prompt
if [[ $prompt == "y" || $prompt == "Y" || $prompt == "yes" || $prompt == "Yes" ]]; then if [[ $prompt == "y" || $prompt == "Y" || $prompt == "yes" || $prompt == "Yes" ]]; then
git add -A # git add -A
git commit -m "Bump version to $TAG for release" || true && git push # git commit -m "Bump version to $TAG for release" || true && git push
echo "Creating new git tag $TAG" echo "Creating new git tag $TAG"
git tag "$TAG" -m "$TAG" git tag "$TAG" -m "$TAG"
git push --tags git push --tags

View File

@@ -1,9 +1,11 @@
import json, os, traceback, uuid import json, os, traceback
import tiktok_downloader import tiktok_downloader
from loguru import logger from loguru import logger
from . import Archiver from . import Archiver
from ..core import Metadata, Media, ArchivingContext from ..core import Metadata, Media, ArchivingContext
from ..utils.misc import random_str
class TiktokArchiver(Archiver): class TiktokArchiver(Archiver):
@@ -37,7 +39,7 @@ class TiktokArchiver(Archiver):
logger.warning(f'Other Tiktok error {error}') logger.warning(f'Other Tiktok error {error}')
try: try:
filename = os.path.join(ArchivingContext.get_tmp_dir(), f'{str(uuid.uuid4())[0:8]}.mp4') filename = os.path.join(ArchivingContext.get_tmp_dir(), f'{random_str(8)}.mp4')
tiktok_media = tiktok_downloader.snaptik(url).get_media() tiktok_media = tiktok_downloader.snaptik(url).get_media()
if len(tiktok_media) <= 0: if len(tiktok_media) <= 0:

View File

@@ -15,8 +15,8 @@ class TwitterArchiver(Archiver):
""" """
name = "twitter_archiver" name = "twitter_archiver"
link_pattern = re.compile(r"twitter.com\/(?:\#!\/)?(\w+)\/status(?:es)?\/(\d+)") link_pattern = re.compile(r"(?:twitter|x).com\/(?:\#!\/)?(\w+)\/status(?:es)?\/(\d+)")
link_clean_pattern = re.compile(r"(.+twitter\.com\/.+\/\d+)(\?)*.*") link_clean_pattern = re.compile(r"(.+(?:twitter|x)\.com\/.+\/\d+)(\?)*.*")
def __init__(self, config: dict) -> None: def __init__(self, config: dict) -> None:
super().__init__(config) super().__init__(config)

View File

@@ -25,7 +25,7 @@ class YoutubeDLArchiver(Archiver):
logger.debug('Using Facebook cookie') logger.debug('Using Facebook cookie')
yt_dlp.utils.std_headers['cookie'] = self.facebook_cookie yt_dlp.utils.std_headers['cookie'] = self.facebook_cookie
ydl = yt_dlp.YoutubeDL({'outtmpl': os.path.join(ArchivingContext.get_tmp_dir(), f'%(id)s.%(ext)s'), 'quiet': False}) ydl = yt_dlp.YoutubeDL({'outtmpl': os.path.join(ArchivingContext.get_tmp_dir(), f'%(id)s.%(ext)s'), 'quiet': False, 'noplaylist': True})
try: try:
# don't download since it can be a live stream # don't download since it can be a live stream

View File

@@ -65,7 +65,9 @@ class Media:
@property # getter .mimetype @property # getter .mimetype
def mimetype(self) -> str: def mimetype(self) -> str:
assert self.filename is not None and len(self.filename) > 0, "cannot get mimetype from media without filename" if not self.filename or len(self.filename) == 0:
logger.warning(f"cannot get mimetype from media without filename: {self}")
return ""
if not self._mimetype: if not self._mimetype:
self._mimetype = mimetypes.guess_type(self.filename)[0] self._mimetype = mimetypes.guess_type(self.filename)[0]
return self._mimetype or "" return self._mimetype or ""

View File

@@ -105,7 +105,8 @@ class Metadata:
def get_timestamp(self, utc=True, iso=True) -> datetime.datetime: def get_timestamp(self, utc=True, iso=True) -> datetime.datetime:
ts = self.get("timestamp") ts = self.get("timestamp")
if not ts: return ts if not ts: return
if type(ts) == float: ts = datetime.datetime.fromtimestamp(ts)
if utc: ts = ts.replace(tzinfo=datetime.timezone.utc) if utc: ts = ts.replace(tzinfo=datetime.timezone.utc)
if iso: return ts.isoformat() if iso: return ts.isoformat()
return ts return ts
@@ -164,3 +165,16 @@ class Metadata:
def __str__(self) -> str: def __str__(self) -> str:
return self.__repr__() return self.__repr__()
@staticmethod
def choose_most_complete(results: List[Metadata]) -> Metadata:
# returns the most complete result from a list of results
# prioritizes results with more media, then more metadata
if len(results) == 0: return None
if len(results) == 1: return results[0]
most_complete = results[0]
for r in results[1:]:
if len(r.media) > len(most_complete.media): most_complete = r
elif len(r.media) == len(most_complete.media) and len(r.metadata) > len(most_complete.metadata): most_complete = r
return most_complete

View File

@@ -77,7 +77,7 @@ class ArchivingOrchestrator:
if cached_result: if cached_result:
logger.debug("Found previously archived entry") logger.debug("Found previously archived entry")
for d in self.databases: for d in self.databases:
d.done(cached_result) d.done(cached_result, cached=True)
return cached_result return cached_result
# 3 - call archivers until one succeeds # 3 - call archivers until one succeeds

View File

@@ -1,3 +1,4 @@
from typing import Union
import requests, os import requests, os
from loguru import logger from loguru import logger
@@ -14,6 +15,8 @@ class AAApiDb(Database):
def __init__(self, config: dict) -> None: def __init__(self, config: dict) -> None:
# without this STEP.__init__ is not called # without this STEP.__init__ is not called
super().__init__(config) super().__init__(config)
self.allow_rearchive = bool(self.allow_rearchive)
self.store_results = bool(self.store_results)
self.assert_valid_string("api_endpoint") self.assert_valid_string("api_endpoint")
self.assert_valid_string("api_secret") self.assert_valid_string("api_secret")
@@ -21,16 +24,40 @@ class AAApiDb(Database):
def configs() -> dict: def configs() -> dict:
return { return {
"api_endpoint": {"default": None, "help": "API endpoint where calls are made to"}, "api_endpoint": {"default": None, "help": "API endpoint where calls are made to"},
"api_secret": {"default": None, "help": "API authentication secret"}, "api_secret": {"default": None, "help": "API Basic authentication secret [deprecating soon]"},
"api_token": {"default": None, "help": "API Bearer token, to be preferred over secret (Basic auth) going forward"},
"public": {"default": False, "help": "whether the URL should be publicly available via the API"}, "public": {"default": False, "help": "whether the URL should be publicly available via the API"},
"author_id": {"default": None, "help": "which email to assign as author"}, "author_id": {"default": None, "help": "which email to assign as author"},
"group_id": {"default": None, "help": "which group of users have access to the archive in case public=false as author"}, "group_id": {"default": None, "help": "which group of users have access to the archive in case public=false as author"},
"allow_rearchive": {"default": True, "help": "if False then the API database will be queried prior to any archiving operations and stop if the link has already been archived"},
"store_results": {"default": True, "help": "when set, will send the results to the API database."},
"tags": {"default": [], "help": "what tags to add to the archived URL", "cli_set": lambda cli_val, cur_val: set(cli_val.split(","))}, "tags": {"default": [], "help": "what tags to add to the archived URL", "cli_set": lambda cli_val, cur_val: set(cli_val.split(","))},
} }
def fetch(self, item: Metadata) -> Union[Metadata, bool]:
""" query the database for the existence of this item"""
if not self.allow_rearchive: return
params = {"url": item.get_url(), "limit": 15}
headers = {"Authorization": f"Bearer {self.api_token}", "accept": "application/json"}
response = requests.get(os.path.join(self.api_endpoint, "tasks/search-url"), params=params, headers=headers)
def done(self, item: Metadata) -> None: if response.status_code == 200:
if len(response.json()):
logger.success(f"API returned {len(response.json())} previously archived instance(s)")
fetched_metadata = [Metadata.from_dict(r["result"]) for r in response.json()]
return Metadata.choose_most_complete(fetched_metadata)
else:
logger.error(f"AA API FAIL ({response.status_code}): {response.json()}")
return False
def done(self, item: Metadata, cached: bool=False) -> None:
"""archival result ready - should be saved to DB""" """archival result ready - should be saved to DB"""
logger.info(f"saving archive of {item.get_url()} to the AA API.") if not self.store_results: return
if cached:
logger.debug(f"skipping saving archive of {item.get_url()} to the AA API because it was cached")
return
logger.debug(f"saving archive of {item.get_url()} to the AA API.")
payload = {'result': item.to_json(), 'public': self.public, 'author_id': self.author_id, 'group_id': self.group_id, 'tags': list(self.tags)} payload = {'result': item.to_json(), 'public': self.public, 'author_id': self.author_id, 'group_id': self.group_id, 'tags': list(self.tags)}
response = requests.post(os.path.join(self.api_endpoint, "submit-archive"), json=payload, auth=("abc", self.api_secret)) response = requests.post(os.path.join(self.api_endpoint, "submit-archive"), json=payload, auth=("abc", self.api_secret))
@@ -39,3 +66,5 @@ class AAApiDb(Database):
logger.success(f"AA API: {response.json()}") logger.success(f"AA API: {response.json()}")
else: else:
logger.error(f"AA API FAIL ({response.status_code}): {response.json()}") logger.error(f"AA API FAIL ({response.status_code}): {response.json()}")

View File

@@ -27,6 +27,6 @@ class ConsoleDb(Database):
def aborted(self, item: Metadata) -> None: def aborted(self, item: Metadata) -> None:
logger.warning(f"ABORTED {item}") logger.warning(f"ABORTED {item}")
def done(self, item: Metadata) -> None: def done(self, item: Metadata, cached: bool=False) -> None:
"""archival result ready - should be saved to DB""" """archival result ready - should be saved to DB"""
logger.success(f"DONE {item}") logger.success(f"DONE {item}")

View File

@@ -24,7 +24,7 @@ class CSVDb(Database):
"csv_file": {"default": "db.csv", "help": "CSV file name"} "csv_file": {"default": "db.csv", "help": "CSV file name"}
} }
def done(self, item: Metadata) -> None: def done(self, item: Metadata, cached: bool=False) -> None:
"""archival result ready - should be saved to DB""" """archival result ready - should be saved to DB"""
logger.success(f"DONE {item}") logger.success(f"DONE {item}")
is_empty = not os.path.isfile(self.csv_file) or os.path.getsize(self.csv_file) == 0 is_empty = not os.path.isfile(self.csv_file) or os.path.getsize(self.csv_file) == 0

View File

@@ -36,6 +36,6 @@ class Database(Step, ABC):
return False return False
@abstractmethod @abstractmethod
def done(self, item: Metadata) -> None: def done(self, item: Metadata, cached: bool=False) -> None:
"""archival result ready - should be saved to DB""" """archival result ready - should be saved to DB"""
pass pass

View File

@@ -41,7 +41,7 @@ class GsheetsDb(Database):
"""check if the given item has been archived already""" """check if the given item has been archived already"""
return False return False
def done(self, item: Metadata) -> None: def done(self, item: Metadata, cached: bool=False) -> None:
"""archival result ready - should be saved to DB""" """archival result ready - should be saved to DB"""
logger.success(f"DONE {item.get_url()}") logger.success(f"DONE {item.get_url()}")
gw, row = self._retrieve_gsheet(item) gw, row = self._retrieve_gsheet(item)
@@ -57,8 +57,10 @@ class GsheetsDb(Database):
cell_updates.append((row, col, final_value)) cell_updates.append((row, col, final_value))
except Exception as e: except Exception as e:
logger.error(f"Unable to batch {col}={final_value} due to {e}") logger.error(f"Unable to batch {col}={final_value} due to {e}")
status_message = item.status
cell_updates.append((row, 'status', item.status)) if cached:
status_message = f"[cached] {status_message}"
cell_updates.append((row, 'status', status_message))
media: Media = item.get_final_media() media: Media = item.get_final_media()
if hasattr(media, "urls"): if hasattr(media, "urls"):

View File

@@ -1,9 +1,10 @@
from loguru import logger from loguru import logger
import time, uuid, os import time, os
from selenium.common.exceptions import TimeoutException from selenium.common.exceptions import TimeoutException
from . import Enricher from . import Enricher
from ..utils import Webdriver, UrlUtil from ..utils import Webdriver, UrlUtil, random_str
from ..core import Media, Metadata, ArchivingContext from ..core import Media, Metadata, ArchivingContext
class ScreenshotEnricher(Enricher): class ScreenshotEnricher(Enricher):
@@ -29,7 +30,7 @@ class ScreenshotEnricher(Enricher):
try: try:
driver.get(url) driver.get(url)
time.sleep(int(self.sleep_before_screenshot)) time.sleep(int(self.sleep_before_screenshot))
screenshot_file = os.path.join(ArchivingContext.get_tmp_dir(), f"screenshot_{str(uuid.uuid4())[0:8]}.png") screenshot_file = os.path.join(ArchivingContext.get_tmp_dir(), f"screenshot_{random_str(8)}.png")
driver.save_screenshot(screenshot_file) driver.save_screenshot(screenshot_file)
to_enrich.add_media(Media(filename=screenshot_file), id="screenshot") to_enrich.add_media(Media(filename=screenshot_file), id="screenshot")
except TimeoutException: except TimeoutException:

View File

@@ -1,8 +1,9 @@
import ffmpeg, os, uuid import ffmpeg, os
from loguru import logger from loguru import logger
from . import Enricher from . import Enricher
from ..core import Media, Metadata, ArchivingContext from ..core import Media, Metadata, ArchivingContext
from ..utils.misc import random_str
class ThumbnailEnricher(Enricher): class ThumbnailEnricher(Enricher):
@@ -23,7 +24,7 @@ class ThumbnailEnricher(Enricher):
logger.debug(f"generating thumbnails") logger.debug(f"generating thumbnails")
for i, m in enumerate(to_enrich.media[::]): for i, m in enumerate(to_enrich.media[::]):
if m.is_video(): if m.is_video():
folder = os.path.join(ArchivingContext.get_tmp_dir(), str(uuid.uuid4())) folder = os.path.join(ArchivingContext.get_tmp_dir(), random_str(24))
os.makedirs(folder, exist_ok=True) os.makedirs(folder, exist_ok=True)
logger.debug(f"generating thumbnails for {m.filename}") logger.debug(f"generating thumbnails for {m.filename}")
fps, duration = 0.5, m.get("duration") fps, duration = 0.5, m.get("duration")

View File

@@ -1,5 +1,6 @@
import jsonlines
import mimetypes import mimetypes
import os, shutil, subprocess, uuid import os, shutil, subprocess
from zipfile import ZipFile from zipfile import ZipFile
from loguru import logger from loguru import logger
from warcio.archiveiterator import ArchiveIterator from warcio.archiveiterator import ArchiveIterator
@@ -7,7 +8,7 @@ from warcio.archiveiterator import ArchiveIterator
from ..core import Media, Metadata, ArchivingContext from ..core import Media, Metadata, ArchivingContext
from . import Enricher from . import Enricher
from ..archivers import Archiver from ..archivers import Archiver
from ..utils import UrlUtil from ..utils import UrlUtil, random_str
class WaczArchiverEnricher(Enricher, Archiver): class WaczArchiverEnricher(Enricher, Archiver):
@@ -27,6 +28,7 @@ class WaczArchiverEnricher(Enricher, Archiver):
def configs() -> dict: def configs() -> dict:
return { return {
"profile": {"default": None, "help": "browsertrix-profile (for profile generation see https://github.com/webrecorder/browsertrix-crawler#creating-and-using-browser-profiles)."}, "profile": {"default": None, "help": "browsertrix-profile (for profile generation see https://github.com/webrecorder/browsertrix-crawler#creating-and-using-browser-profiles)."},
"docker_commands": {"default": None, "help":"if a custom docker invocation is needed"},
"timeout": {"default": 120, "help": "timeout for WACZ generation in seconds"}, "timeout": {"default": 120, "help": "timeout for WACZ generation in seconds"},
"extract_media": {"default": True, "help": "If enabled all the images/videos/audio present in the WACZ archive will be extracted into separate Media. The .wacz file will be kept untouched."} "extract_media": {"default": True, "help": "If enabled all the images/videos/audio present in the WACZ archive will be extracted into separate Media. The .wacz file will be kept untouched."}
} }
@@ -45,52 +47,46 @@ class WaczArchiverEnricher(Enricher, Archiver):
url = to_enrich.get_url() url = to_enrich.get_url()
collection = str(uuid.uuid4())[0:8] collection = random_str(8)
browsertrix_home = os.path.abspath(ArchivingContext.get_tmp_dir()) browsertrix_home_host = os.environ.get('BROWSERTRIX_HOME_HOST') or os.path.abspath(ArchivingContext.get_tmp_dir())
browsertrix_home_container = os.environ.get('BROWSERTRIX_HOME_CONTAINER') or browsertrix_home_host
if os.getenv('RUNNING_IN_DOCKER'): cmd = [
"crawl",
"--url", url,
"--scopeType", "page",
"--generateWACZ",
"--text",
"--screenshot", "fullPage",
"--collection", collection,
"--id", collection,
"--saveState", "never",
"--behaviors", "autoscroll,autoplay,autofetch,siteSpecific",
"--behaviorTimeout", str(self.timeout),
"--timeout", str(self.timeout)]
# call docker if explicitly enabled or we are running on the host (not in docker)
use_docker = os.environ.get('WACZ_ENABLE_DOCKER') or not os.environ.get('RUNNING_IN_DOCKER')
if use_docker:
logger.debug(f"generating WACZ in Docker for {url=}")
logger.debug(f"{browsertrix_home_host=} {browsertrix_home_container=}")
if self.docker_commands:
cmd = self.docker_commands + cmd
else:
cmd = ["docker", "run", "--rm", "-v", f"{browsertrix_home_host}:/crawls/", "webrecorder/browsertrix-crawler"] + cmd
if self.profile:
profile_fn = os.path.join(browsertrix_home_container, "profile.tar.gz")
logger.debug(f"copying {self.profile} to {profile_fn}")
shutil.copyfile(self.profile, profile_fn)
cmd.extend(["--profile", os.path.join("/crawls", "profile.tar.gz")])
else:
logger.debug(f"generating WACZ without Docker for {url=}") logger.debug(f"generating WACZ without Docker for {url=}")
cmd = [
"crawl",
"--url", url,
"--scopeType", "page",
"--generateWACZ",
"--text",
"--screenshot", "fullPage",
"--collection", collection,
"--id", collection,
"--saveState", "never",
"--behaviors", "autoscroll,autoplay,autofetch,siteSpecific",
"--behaviorTimeout", str(self.timeout),
"--timeout", str(self.timeout)]
if self.profile: if self.profile:
cmd.extend(["--profile", os.path.join("/app", str(self.profile))]) cmd.extend(["--profile", os.path.join("/app", str(self.profile))])
else:
logger.debug(f"generating WACZ in Docker for {url=}")
cmd = [
"docker", "run",
"--rm", # delete container once it has completed running
"-v", f"{browsertrix_home}:/crawls/",
# "-it", # this leads to "the input device is not a TTY"
"webrecorder/browsertrix-crawler", "crawl",
"--url", url,
"--scopeType", "page",
"--generateWACZ",
"--text",
"--screenshot", "fullPage",
"--collection", collection,
"--behaviors", "autoscroll,autoplay,autofetch,siteSpecific",
"--behaviorTimeout", str(self.timeout),
"--timeout", str(self.timeout)
]
if self.profile:
profile_fn = os.path.join(browsertrix_home, "profile.tar.gz")
shutil.copyfile(self.profile, profile_fn)
cmd.extend(["--profile", os.path.join("/crawls", "profile.tar.gz")])
try: try:
logger.info(f"Running browsertrix-crawler: {' '.join(cmd)}") logger.info(f"Running browsertrix-crawler: {' '.join(cmd)}")
@@ -99,18 +95,36 @@ class WaczArchiverEnricher(Enricher, Archiver):
logger.error(f"WACZ generation failed: {e}") logger.error(f"WACZ generation failed: {e}")
return False return False
if os.getenv('RUNNING_IN_DOCKER'): if use_docker:
filename = os.path.join("collections", collection, f"{collection}.wacz") wacz_fn = os.path.join(browsertrix_home_container, "collections", collection, f"{collection}.wacz")
else: else:
filename = os.path.join(browsertrix_home, "collections", collection, f"{collection}.wacz") wacz_fn = os.path.join("collections", collection, f"{collection}.wacz")
if not os.path.exists(filename): if not os.path.exists(wacz_fn):
logger.warning(f"Unable to locate and upload WACZ {filename=}") logger.warning(f"Unable to locate and upload WACZ {wacz_fn=}")
return False return False
to_enrich.add_media(Media(filename), "browsertrix") to_enrich.add_media(Media(wacz_fn), "browsertrix")
if self.extract_media: if self.extract_media:
self.extract_media_from_wacz(to_enrich, filename) self.extract_media_from_wacz(to_enrich, wacz_fn)
if use_docker:
jsonl_fn = os.path.join(browsertrix_home_container, "collections", collection, "pages", "pages.jsonl")
else:
jsonl_fn = os.path.join("collections", collection, "pages", "pages.jsonl")
if not os.path.exists(jsonl_fn):
logger.warning(f"Unable to locate and pages.jsonl {jsonl_fn=}")
else:
logger.info(f"Parsing pages.jsonl {jsonl_fn=}")
with jsonlines.open(jsonl_fn) as reader:
for obj in reader:
if 'title' in obj:
to_enrich.set_title(obj['title'])
if 'text' in obj:
to_enrich.set_content(obj['text'])
return True return True
def extract_media_from_wacz(self, to_enrich: Metadata, wacz_filename: str) -> None: def extract_media_from_wacz(self, to_enrich: Metadata, wacz_filename: str) -> None:

View File

@@ -1,6 +1,6 @@
from __future__ import annotations from __future__ import annotations
from dataclasses import dataclass from dataclasses import dataclass
import mimetypes, uuid, os, pathlib import mimetypes, os, pathlib
from jinja2 import Environment, FileSystemLoader from jinja2 import Environment, FileSystemLoader
from urllib.parse import quote from urllib.parse import quote
from loguru import logger from loguru import logger
@@ -9,6 +9,7 @@ from ..version import __version__
from ..core import Metadata, Media, ArchivingContext from ..core import Metadata, Media, ArchivingContext
from . import Formatter from . import Formatter
from ..enrichers import HashEnricher from ..enrichers import HashEnricher
from ..utils.misc import random_str
@dataclass @dataclass
@@ -44,10 +45,10 @@ class HtmlFormatter(Formatter):
metadata=item.metadata, metadata=item.metadata,
version=__version__ version=__version__
) )
html_path = os.path.join(ArchivingContext.get_tmp_dir(), f"formatted{str(uuid.uuid4())}.html") html_path = os.path.join(ArchivingContext.get_tmp_dir(), f"formatted{random_str(24)}.html")
with open(html_path, mode="w", encoding="utf-8") as outf: with open(html_path, mode="w", encoding="utf-8") as outf:
outf.write(content) outf.write(content)
final_media = Media(filename=html_path) final_media = Media(filename=html_path, _mimetype="text/html")
he = HashEnricher({"hash_enricher": {"algorithm": ArchivingContext.get("hash_enricher.algorithm"), "chunksize": 1.6e7}}) he = HashEnricher({"hash_enricher": {"algorithm": ArchivingContext.get("hash_enricher.algorithm"), "chunksize": 1.6e7}})
if len(hd := he.calculate_hash(final_media.filename)): if len(hd := he.calculate_hash(final_media.filename)):

View File

@@ -119,7 +119,7 @@ class GDriveStorage(Storage):
'parents': [upload_to] 'parents': [upload_to]
} }
media = MediaFileUpload(media.filename, resumable=True) media = MediaFileUpload(media.filename, resumable=True)
gd_file = self.service.files().create(body=file_metadata, media_body=media, fields='id').execute() gd_file = self.service.files().create(supportsAllDrives=True, body=file_metadata, media_body=media, fields='id').execute()
logger.debug(f'uploadf: uploaded file {gd_file["id"]} successfully in folder={upload_to}') logger.debug(f'uploadf: uploaded file {gd_file["id"]} successfully in folder={upload_to}')
# must be implemented even if unused # must be implemented even if unused
@@ -150,6 +150,9 @@ class GDriveStorage(Storage):
for attempt in range(retries): for attempt in range(retries):
results = self.service.files().list( results = self.service.files().list(
# both below for Google Shared Drives
supportsAllDrives=True,
includeItemsFromAllDrives=True,
q=query_string, q=query_string,
spaces='drive', # ie not appDataFolder or photos spaces='drive', # ie not appDataFolder or photos
fields='files(id, name)' fields='files(id, name)'
@@ -182,7 +185,7 @@ class GDriveStorage(Storage):
'mimeType': 'application/vnd.google-apps.folder', 'mimeType': 'application/vnd.google-apps.folder',
'parents': [parent_id] 'parents': [parent_id]
} }
gd_folder = self.service.files().create(body=file_metadata, fields='id').execute() gd_folder = self.service.files().create(supportsAllDrives=True, body=file_metadata, fields='id').execute()
return gd_folder.get('id') return gd_folder.get('id')
# def exists(self, key): # def exists(self, key):

View File

@@ -1,14 +1,14 @@
from typing import IO, Any from typing import IO
import boto3, uuid, os, mimetypes import boto3, os
from botocore.errorfactory import ClientError
from ..core import Metadata from ..utils.misc import random_str
from ..core import Media from ..core import Media
from ..storages import Storage from ..storages import Storage
from ..enrichers import HashEnricher
from loguru import logger from loguru import logger
from slugify import slugify
NO_DUPLICATES_FOLDER = "no-dups/"
class S3Storage(Storage): class S3Storage(Storage):
name = "s3_storage" name = "s3_storage"
@@ -21,6 +21,9 @@ class S3Storage(Storage):
aws_access_key_id=self.key, aws_access_key_id=self.key,
aws_secret_access_key=self.secret aws_secret_access_key=self.secret
) )
self.random_no_duplicate = bool(self.random_no_duplicate)
if self.random_no_duplicate:
logger.warning("random_no_duplicate is set to True, this will override `path_generator`, `filename_generator` and `folder`.")
@staticmethod @staticmethod
def configs() -> dict: def configs() -> dict:
@@ -31,7 +34,7 @@ class S3Storage(Storage):
"region": {"default": None, "help": "S3 region name"}, "region": {"default": None, "help": "S3 region name"},
"key": {"default": None, "help": "S3 API key"}, "key": {"default": None, "help": "S3 API key"},
"secret": {"default": None, "help": "S3 API secret"}, "secret": {"default": None, "help": "S3 API secret"},
# TODO: how to have sth like a custom folder? has to come from the feeders "random_no_duplicate": {"default": False, "help": f"if set, it will override `path_generator`, `filename_generator` and `folder`. It will check if the file already exists and if so it will not upload it again. Creates a new root folder path `{NO_DUPLICATES_FOLDER}`"},
"endpoint_url": { "endpoint_url": {
"default": 'https://{region}.digitaloceanspaces.com', "default": 'https://{region}.digitaloceanspaces.com',
"help": "S3 bucket endpoint, {region} are inserted at runtime" "help": "S3 bucket endpoint, {region} are inserted at runtime"
@@ -47,6 +50,22 @@ class S3Storage(Storage):
return self.cdn_url.format(bucket=self.bucket, region=self.region, key=media.key) return self.cdn_url.format(bucket=self.bucket, region=self.region, key=media.key)
def uploadf(self, file: IO[bytes], media: Media, **kwargs: dict) -> None: def uploadf(self, file: IO[bytes], media: Media, **kwargs: dict) -> None:
if not self.is_upload_needed(media): return True
if self.random_no_duplicate:
# checks if a folder with the hash already exists, if so it skips the upload
he = HashEnricher({"hash_enricher": {"algorithm": "SHA-256", "chunksize": 1.6e7}})
hd = he.calculate_hash(media.filename)
path = os.path.join(NO_DUPLICATES_FOLDER, hd[:24])
if existing_key:=self.file_in_folder(path):
media.key = existing_key
logger.debug(f"skipping upload of {media.filename} because it already exists in {media.key}")
return True
_, ext = os.path.splitext(media.key)
media.key = os.path.join(path, f"{random_str(24)}{ext}")
extra_args = kwargs.get("extra_args", {}) extra_args = kwargs.get("extra_args", {})
if not self.private and 'ACL' not in extra_args: if not self.private and 'ACL' not in extra_args:
extra_args['ACL'] = 'public-read' extra_args['ACL'] = 'public-read'
@@ -60,14 +79,30 @@ class S3Storage(Storage):
self.s3.upload_fileobj(file, Bucket=self.bucket, Key=media.key, ExtraArgs=extra_args) self.s3.upload_fileobj(file, Bucket=self.bucket, Key=media.key, ExtraArgs=extra_args)
return True return True
def is_upload_needed(self, media: Media) -> bool:
    """Decide whether ``media`` must be uploaded and assign its storage key.

    When ``self.random_no_duplicate`` is falsy nothing is checked and the
    method returns ``True`` without touching ``media``.

    Otherwise the SHA-256 hash of the file at ``media.filename`` is computed
    and its first 24 characters name a folder under ``NO_DUPLICATES_FOLDER``.
    If ``self.file_in_folder`` reports an object in that folder, ``media.key``
    is pointed at that object and ``False`` is returned (no upload needed).
    If not, ``media.key`` is replaced by a new random file name inside that
    folder, keeping the extension of the previous key, and ``True`` is
    returned.

    Args:
        media: the media item to check; its ``key`` is rewritten in
            ``random_no_duplicate`` mode.

    Returns:
        ``True`` if the caller should upload the file, ``False`` if an
        object was already found in the hash folder.
    """
    if not self.random_no_duplicate:
        return True

    # S3 object keys always use "/" as separator, so build them with
    # posixpath instead of os.path (which would emit "\\" on Windows).
    import posixpath

    he = HashEnricher({"hash_enricher": {"algorithm": "SHA-256", "chunksize": 1.6e7}})
    hd = he.calculate_hash(media.filename)
    path = posixpath.join(NO_DUPLICATES_FOLDER, hd[:24])

    # an object already stored in the hash folder means this content was
    # uploaded before: reuse its key and skip the upload
    if existing_key := self.file_in_folder(path):
        media.key = existing_key
        logger.debug(f"skipping upload of {media.filename} because it already exists in {media.key}")
        return False

    # first time this content is seen: random file name inside the hash folder
    _, ext = posixpath.splitext(media.key)
    media.key = posixpath.join(path, f"{random_str(24)}{ext}")
    return True
def file_in_folder(self, path:str) -> str:
    """Return the key of the first object listed under the ``path`` prefix.

    ``path`` is treated as a folder: a trailing ``/`` is appended when it is
    missing, and the bucket ``self.bucket`` is listed with that prefix, ``/``
    as delimiter and at most one key requested.

    Returns the ``Key`` of the first entry in the listing's ``Contents``, or
    ``False`` when the response has no ``Contents`` (note that the ``str``
    return annotation does not cover this case).
    """
    prefix = path if path.endswith('/') else path + '/'
    listing = self.s3.list_objects(Bucket=self.bucket, Prefix=prefix, Delimiter='/', MaxKeys=1)
    # no 'Contents' entry: nothing is stored under this prefix
    if 'Contents' not in listing:
        return False
    return listing['Contents'][0]['Key']
# def exists(self, key: str) -> bool:
# """
# Tests if a given file with key=key exists in the bucket
# """
# try:
# self.s3.head_object(Bucket=self.bucket, Key=key)
# return True
# except ClientError as e:
# logger.warning(f"got a ClientError when checking if {key=} exists in bucket={self.bucket}: {e}")
# return False

View File

@@ -2,11 +2,13 @@ from __future__ import annotations
from abc import abstractmethod from abc import abstractmethod
from dataclasses import dataclass from dataclasses import dataclass
from typing import IO from typing import IO
import os
from ..utils.misc import random_str
from ..core import Media, Step, ArchivingContext from ..core import Media, Step, ArchivingContext
from ..enrichers import HashEnricher from ..enrichers import HashEnricher
from loguru import logger from loguru import logger
import os, uuid
from slugify import slugify from slugify import slugify
@@ -72,10 +74,10 @@ class Storage(Step):
filename = slugify(filename) # in case it comes with os.sep filename = slugify(filename) # in case it comes with os.sep
elif self.path_generator == "url": path = slugify(url) elif self.path_generator == "url": path = slugify(url)
elif self.path_generator == "random": elif self.path_generator == "random":
path = ArchivingContext.get("random_path", str(uuid.uuid4())[:16], True) path = ArchivingContext.get("random_path", random_str(24), True)
# filename_generator logic # filename_generator logic
if self.filename_generator == "random": filename = str(uuid.uuid4())[:16] if self.filename_generator == "random": filename = random_str(24)
elif self.filename_generator == "static": elif self.filename_generator == "static":
he = HashEnricher({"hash_enricher": {"algorithm": ArchivingContext.get("hash_enricher.algorithm"), "chunksize": 1.6e7}}) he = HashEnricher({"hash_enricher": {"algorithm": ArchivingContext.get("hash_enricher.algorithm"), "chunksize": 1.6e7}})
hd = he.calculate_hash(media.filename) hd = he.calculate_hash(media.filename)

View File

@@ -1,5 +1,6 @@
import os, json, requests import os, json, requests
import uuid
from datetime import datetime from datetime import datetime
from loguru import logger from loguru import logger
@@ -49,3 +50,7 @@ def update_nested_dict(dictionary, update_dict):
update_nested_dict(dictionary[key], value) update_nested_dict(dictionary[key], value)
else: else:
dictionary[key] = value dictionary[key] = value
def random_str(length: int = 32) -> str:
    """Return a random lowercase hexadecimal string of ``length`` characters.

    The characters come from UUID4 values with the dashes removed. One UUID4
    provides 32 characters; for longer strings several UUID4 values are
    concatenated before truncating.

    Args:
        length: number of characters wanted; must not be negative.

    Returns:
        A string of exactly ``length`` characters drawn from ``0-9a-f``.

    Raises:
        ValueError: if ``length`` is negative (a negative slice would
            otherwise silently return a string of the wrong size).
    """
    if length < 0:
        raise ValueError(f"length must not be negative, got {length}")
    # ceil(length / 32): how many 32-character UUID4 hex strings are needed
    chunks = -(-length // 32)
    return "".join(uuid.uuid4().hex for _ in range(chunks))[:length]

View File

@@ -65,6 +65,9 @@ class UrlUtil:
if "vk.com/images/" in url: return False if "vk.com/images/" in url: return False
if "vk.com/images/reaction/" in url: return False if "vk.com/images/reaction/" in url: return False
# wikipedia
if "wikipedia.org/static" in url: return False
return True return True
@staticmethod @staticmethod

View File

@@ -15,7 +15,7 @@ class Webdriver:
def __enter__(self) -> webdriver: def __enter__(self) -> webdriver:
options = webdriver.FirefoxOptions() options = webdriver.FirefoxOptions()
options.headless = True options.add_argument("--headless")
options.set_preference('network.protocol-handler.external.tg', False) options.set_preference('network.protocol-handler.external.tg', False)
try: try:
self.driver = webdriver.Firefox(options=options) self.driver = webdriver.Firefox(options=options)

View File

@@ -1,9 +1,9 @@
_MAJOR = "0" _MAJOR = "0"
_MINOR = "6" _MINOR = "7"
# On main and in a nightly release the patch should be one ahead of the last # On main and in a nightly release the patch should be one ahead of the last
# released build. # released build.
_PATCH = "6" _PATCH = "7"
# This is mainly for nightly builds which have the suffix ".dev$DATE". See # This is mainly for nightly builds which have the suffix ".dev$DATE". See
# https://semver.org/#is-v123-a-semantic-version for the semantics. # https://semver.org/#is-v123-a-semantic-version for the semantics.
_SUFFIX = "" _SUFFIX = ""