Compare commits

...

42 Commits

Author SHA1 Message Date
msramalho
b9b831ce03 v8.0.1 2024-02-01 15:08:55 +00:00
msramalho
2a773a25e8 better handling of telethon data display 2024-02-01 15:08:23 +00:00
msramalho
719645fc2d minor improvement to html_template 2024-02-01 15:03:00 +00:00
Chu-An, Huang
71fcf5a089 fix: Correct the path of service account in google drive settings (#123)
* fix: Correct the path of service account in yaml file

* fix: Remove redefined function

* Update src/auto_archiver/storages/gd.py

* fix: remove unwanted drafting code

---------

Co-authored-by: Miguel Sozinho Ramalho <19508417+msramalho@users.noreply.github.com>
2024-02-01 15:02:04 +00:00
Tomas Apodaca
590d3fe824 Fix typo in readme (#121) 2024-01-24 21:17:31 +00:00
Miguel Sozinho Ramalho
e6b6b83007 0.8.0 new features and dependency updates (#119)
* wacz can extract_screenshot only

* new meta enricher

* twitter api can use multiple authentication tokens in sequence

* cleanup non-dup logic

* meta info on archive duration

* minor html report update

* updated dependencies

* new version
2023-12-20 14:13:22 +00:00
msramalho
499832d146 fix datetime parsing 2023-12-13 18:41:48 +00:00
msramalho
fa1163532b patching now optional value 2023-12-13 13:55:31 +00:00
msramalho
96f6ea8f09 v0.7.8 2023-12-13 13:03:39 +00:00
Miguel Sozinho Ramalho
ff17dfd0aa enables option to toggle db api writes (#118) 2023-12-13 12:54:47 +00:00
msramalho
0a3053bbc7 version update 2023-12-13 11:29:13 +00:00
Miguel Sozinho Ramalho
e69660be82 chooses most complete result from api (#117) 2023-12-13 11:28:27 +00:00
Miguel Sozinho Ramalho
a786d4bb0e chooses most complete result from api (#116) 2023-12-13 11:26:46 +00:00
Miguel Sozinho Ramalho
128d4136e3 fixes empty api search results (#115) 2023-12-13 10:51:25 +00:00
Miguel Sozinho Ramalho
98fb574d89 fixing older db entries formats (#114) 2023-12-12 22:47:54 +00:00
Miguel Sozinho Ramalho
6f36e92e02 enables api_db cache queries if configured with new option (#113) 2023-12-12 19:20:26 +00:00
Miguel Sozinho Ramalho
3e56ef137d reduce s3 duplicating while keeping random urls via hash (#112) 2023-12-12 19:12:03 +00:00
Jett Chen
9ee323a654 Set _mimetype for final media of html formatter (#111) 2023-12-11 11:47:04 +00:00
Kai
9eb39943c7 Extract text in wacz_enricher (#110) 2023-12-05 22:24:12 +00:00
msramalho
8624e9f177 version update 0.7.1 2023-11-13 11:58:43 +01:00
Galen Reich
381940f5a8 Fix Selenium headless invokation (#106)
Co-authored-by: msramalho <19508417+msramalho@users.noreply.github.com>
2023-11-13 11:56:35 +01:00
msramalho
1382f8b795 version bump and release without commit 2023-09-22 10:18:58 +01:00
Dave Mateer
fac8364762 Updated gd.py to work with shared folders (#102)
Co-authored-by: msramalho <19508417+msramalho@users.noreply.github.com>
2023-09-22 10:17:54 +01:00
msramalho
0feeb0bd24 Bump version to v0.6.12 for release 2023-09-20 10:18:44 +01:00
msramalho
ddb9dc87d7 unfortunately needed twitter->x 2023-09-20 10:17:31 +01:00
msramalho
e8935b9a80 Bump version to v0.6.11 for release 2023-09-15 19:53:07 +01:00
msramalho
b157f9a6b1 renaming variable 2023-09-15 19:52:47 +01:00
msramalho
ea38a604bb fixes #96 by not assigning to self.prop 2023-09-15 19:35:35 +01:00
msramalho
53494c961e Bump version to v0.6.10 for release 2023-09-14 17:50:08 +01:00
Kai
f7839a99cc Add configs for path to write and read wacz archives (#93)
Co-authored-by: msramalho <19508417+msramalho@users.noreply.github.com>
2023-09-14 17:49:37 +01:00
msramalho
7a2119e6e9 Bump version to v0.6.9 for release 2023-09-12 20:08:00 +01:00
Miguel Sozinho Ramalho
3ae25e51e7 adds flexibile setup for wacz in docker (#94) 2023-09-12 20:07:21 +01:00
msramalho
9584193d69 Bump version to v0.6.8 for release 2023-09-08 15:10:02 +01:00
msramalho
0dd45d90f1 fix: docker+wacz troubles 2023-09-08 15:09:50 +01:00
msramalho
edcb2da74a Bump version to v0.6.7 for release 2023-09-06 17:07:14 +01:00
msramalho
17d9bf694f fix docker image so as not to remove browsertrix files 2023-09-06 17:07:10 +01:00
Miguel Sozinho Ramalho
368395ffa8 Merge pull request #88 from djhmateer/v6-test 2023-08-28 11:09:28 +01:00
Miguel Sozinho Ramalho
21d7d2e16c format youtubedl_archiver.py 2023-08-28 11:09:03 +01:00
Dave Mateer
0bbb4c9b08 Added noplaylist true to youtubedl so that videos in playlists will work 2023-08-27 17:26:36 +01:00
msramalho
a30607801f Bump version to v0.6.6 for release 2023-08-24 17:10:16 +01:00
Miguel Sozinho Ramalho
c75d54a4ec Merge pull request #87 from bellingcat/fix-wacz 2023-08-24 17:09:49 +01:00
msramalho
804fcb1204 browsertrix dependencies isolated into dockerfile 2023-08-24 16:57:58 +01:00
32 changed files with 1479 additions and 1562 deletions

View File

@@ -4,7 +4,6 @@ ENV RUNNING_IN_DOCKER=1
WORKDIR /app WORKDIR /app
# TODO: use custom ffmpeg builds instead of apt-get install
RUN pip install --upgrade pip && \ RUN pip install --upgrade pip && \
pip install pipenv && \ pip install pipenv && \
add-apt-repository ppa:mozillateam/ppa && \ add-apt-repository ppa:mozillateam/ppa && \
@@ -18,19 +17,15 @@ RUN pip install --upgrade pip && \
rm geckodriver-v* rm geckodriver-v*
# TODO: avoid copying unnecessary files, including .git
COPY Pipfile* ./ COPY Pipfile* ./
RUN pipenv install # install from pipenv, with browsertrix-only requirements
RUN pipenv install && \
pipenv install pywb uwsgi
# doing this at the end helps during development, builds are quick # doing this at the end helps during development, builds are quick
COPY ./src/ . COPY ./src/ .
# TODO: figure out how to make volumes not be root, does it depend on host or dockerfile?
# RUN useradd --system --groups sudo --shell /bin/bash archiver && chown -R archiver:sudo .
# USER archiver
ENTRYPOINT ["pipenv", "run", "python3", "-m", "auto_archiver"] ENTRYPOINT ["pipenv", "run", "python3", "-m", "auto_archiver"]
# should be executed with 2 volumes (3 if local_storage is used) # should be executed with 2 volumes (3 if local_storage is used)
# docker run --rm -v $PWD/secrets:/app/secrets -v $PWD/local_archive:/app/local_archive aa pipenv run python3 -m auto_archiver --config secrets/orchestration.yaml # docker run --rm -v $PWD/secrets:/app/secrets -v $PWD/local_archive:/app/local_archive aa pipenv run python3 -m auto_archiver --config secrets/orchestration.yaml

11
Pipfile
View File

@@ -35,14 +35,11 @@ vk-url-scraper = "*"
requests = {extras = ["socks"], version = "*"} requests = {extras = ["socks"], version = "*"}
numpy = "*" numpy = "*"
warcio = "*" warcio = "*"
# pywb and uwsgi are needed for browsertrix to run in docker jsonlines = "*"
# wacz = "==0.4.8"
uwsgi = "*"
pywb = "*"
[requires]
python_version = "3.10"
[dev-packages] [dev-packages]
autopep8 = "*" autopep8 = "*"
setuptools-pipfile = "*" setuptools-pipfile = "*"
[requires]
python_version = "3.10"

2574
Pipfile.lock generated

File diff suppressed because it is too large Load Diff

View File

@@ -179,7 +179,7 @@ The first time you run, you will be prompted to do a authentication with the pho
## Running on Google Sheets Feeder (gsheet_feeder) ## Running on Google Sheets Feeder (gsheet_feeder)
The `--gseets_feeder.sheet` property is the name of the Google Sheet to check for URLs. The `--gsheet_feeder.sheet` property is the name of the Google Sheet to check for URLs.
This sheet must have been shared with the Google Service account used by `gspread`. This sheet must have been shared with the Google Service account used by `gspread`.
This sheet must also have specific columns (case-insensitive) in the `header` as specified in [Gsheet.configs](src/auto_archiver/utils/gsheet.py). The default names of these columns and their purposes are: This sheet must also have specific columns (case-insensitive) in the `header` as specified in [Gsheet.configs](src/auto_archiver/utils/gsheet.py). The default names of these columns and their purposes are:

View File

@@ -8,8 +8,8 @@ TAG=$(python -c 'from src.auto_archiver.version import __version__; print("v" +
read -p "Creating new release for $TAG. Do you want to continue? [Y/n] " prompt read -p "Creating new release for $TAG. Do you want to continue? [Y/n] " prompt
if [[ $prompt == "y" || $prompt == "Y" || $prompt == "yes" || $prompt == "Yes" ]]; then if [[ $prompt == "y" || $prompt == "Y" || $prompt == "yes" || $prompt == "Yes" ]]; then
git add -A # git add -A
git commit -m "Bump version to $TAG for release" || true && git push # git commit -m "Bump version to $TAG for release" || true && git push
echo "Creating new git tag $TAG" echo "Creating new git tag $TAG"
git tag "$TAG" -m "$TAG" git tag "$TAG" -m "$TAG"
git push --tags git push --tags

View File

@@ -146,8 +146,10 @@ class TelethonArchiver(Archiver):
logger.debug(f"Empty media found, skipping {str(mp)=}") logger.debug(f"Empty media found, skipping {str(mp)=}")
continue continue
result.add_media(Media(filename)) result.add_media(Media(filename))
result.set_content(str(post)).set_title(title).set_timestamp(post.date) result.set_title(title).set_timestamp(post.date).set("api_data", post.to_dict())
if post.message != title:
result.set_content(post.message)
return result.success("telethon") return result.success("telethon")
def _get_media_posts_in_group(self, chat, original_post, max_amp=10): def _get_media_posts_in_group(self, chat, original_post, max_amp=10):

View File

@@ -1,9 +1,11 @@
import json, os, traceback, uuid import json, os, traceback
import tiktok_downloader import tiktok_downloader
from loguru import logger from loguru import logger
from . import Archiver from . import Archiver
from ..core import Metadata, Media, ArchivingContext from ..core import Metadata, Media, ArchivingContext
from ..utils.misc import random_str
class TiktokArchiver(Archiver): class TiktokArchiver(Archiver):
@@ -37,7 +39,7 @@ class TiktokArchiver(Archiver):
logger.warning(f'Other Tiktok error {error}') logger.warning(f'Other Tiktok error {error}')
try: try:
filename = os.path.join(ArchivingContext.get_tmp_dir(), f'{str(uuid.uuid4())[0:8]}.mp4') filename = os.path.join(ArchivingContext.get_tmp_dir(), f'{random_str(8)}.mp4')
tiktok_media = tiktok_downloader.snaptik(url).get_media() tiktok_media = tiktok_downloader.snaptik(url).get_media()
if len(tiktok_media) <= 0: if len(tiktok_media) <= 0:

View File

@@ -16,36 +16,55 @@ class TwitterApiArchiver(TwitterArchiver, Archiver):
def __init__(self, config: dict) -> None: def __init__(self, config: dict) -> None:
super().__init__(config) super().__init__(config)
self.api_index = 0
self.apis = []
if len(self.bearer_tokens):
self.apis.extend([Api(bearer_token=bearer_token) for bearer_token in self.bearer_tokens])
if self.bearer_token: if self.bearer_token:
self.assert_valid_string("bearer_token") self.assert_valid_string("bearer_token")
self.api = Api(bearer_token=self.bearer_token) self.apis.append(Api(bearer_token=self.bearer_token))
elif self.consumer_key and self.consumer_secret and self.access_token and self.access_secret: if self.consumer_key and self.consumer_secret and self.access_token and self.access_secret:
self.assert_valid_string("consumer_key") self.assert_valid_string("consumer_key")
self.assert_valid_string("consumer_secret") self.assert_valid_string("consumer_secret")
self.assert_valid_string("access_token") self.assert_valid_string("access_token")
self.assert_valid_string("access_secret") self.assert_valid_string("access_secret")
self.api = Api( self.apis.append(Api(consumer_key=self.consumer_key, consumer_secret=self.consumer_secret,
consumer_key=self.consumer_key, consumer_secret=self.consumer_secret, access_token=self.access_token, access_secret=self.access_secret) access_token=self.access_token, access_secret=self.access_secret))
assert hasattr(self, "api") and self.api is not None, "Missing Twitter API configurations, please provide either bearer_token OR (consumer_key, consumer_secret, access_token, access_secret) to use this archiver." assert self.api_client is not None, "Missing Twitter API configurations, please provide either AND/OR (consumer_key, consumer_secret, access_token, access_secret) to use this archiver, you can provide both for better rate-limit results."
@staticmethod @staticmethod
def configs() -> dict: def configs() -> dict:
return { return {
"bearer_token": {"default": None, "help": "twitter API bearer_token which is enough for archiving, if not provided you will need consumer_key, consumer_secret, access_token, access_secret"}, "bearer_token": {"default": None, "help": "[deprecated: see bearer_tokens] twitter API bearer_token which is enough for archiving, if not provided you will need consumer_key, consumer_secret, access_token, access_secret"},
"bearer_tokens": {"default": [], "help": " a list of twitter API bearer_token which is enough for archiving, if not provided you will need consumer_key, consumer_secret, access_token, access_secret, if provided you can still add those for better rate limits. CSV of bearer tokens if provided via the command line", "cli_set": lambda cli_val, cur_val: list(set(cli_val.split(",")))},
"consumer_key": {"default": None, "help": "twitter API consumer_key"}, "consumer_key": {"default": None, "help": "twitter API consumer_key"},
"consumer_secret": {"default": None, "help": "twitter API consumer_secret"}, "consumer_secret": {"default": None, "help": "twitter API consumer_secret"},
"access_token": {"default": None, "help": "twitter API access_token"}, "access_token": {"default": None, "help": "twitter API access_token"},
"access_secret": {"default": None, "help": "twitter API access_secret"}, "access_secret": {"default": None, "help": "twitter API access_secret"},
} }
@property  # getter .api_client
def api_client(self) -> "Api | None":
    """Return the API client currently selected by ``self.api_index``.

    Returns ``None`` instead of raising ``IndexError`` when no client is
    available at that index (e.g. ``self.apis`` is empty), so callers that
    check ``self.api_client is not None`` -- such as the configuration
    assertion in ``__init__`` -- can report a meaningful error.
    """
    try:
        return self.apis[self.api_index]
    except IndexError:
        # no credentials configured, or the index moved past the last client
        return None
def download(self, item: Metadata) -> Metadata: def download(self, item: Metadata) -> Metadata:
# call download retry until success or no more apis
while self.api_index < len(self.apis):
if res := self.download_retry(item): return res
self.api_index += 1
self.api_index = 0
return False
def download_retry(self, item: Metadata) -> Metadata:
url = item.get_url() url = item.get_url()
# detect URLs that we definitely cannot handle # detect URLs that we definitely cannot handle
username, tweet_id = self.get_username_tweet_id(url) username, tweet_id = self.get_username_tweet_id(url)
if not username: return False if not username: return False
try: try:
tweet = self.api.get_tweet(tweet_id, expansions=["attachments.media_keys"], media_fields=["type", "duration_ms", "url", "variants"], tweet_fields=["attachments", "author_id", "created_at", "entities", "id", "text", "possibly_sensitive"]) tweet = self.api_client.get_tweet(tweet_id, expansions=["attachments.media_keys"], media_fields=["type", "duration_ms", "url", "variants"], tweet_fields=["attachments", "author_id", "created_at", "entities", "id", "text", "possibly_sensitive"])
logger.debug(tweet)
except Exception as e: except Exception as e:
logger.error(f"Could not get tweet: {e}") logger.error(f"Could not get tweet: {e}")
return False return False

View File

@@ -15,8 +15,8 @@ class TwitterArchiver(Archiver):
""" """
name = "twitter_archiver" name = "twitter_archiver"
link_pattern = re.compile(r"twitter.com\/(?:\#!\/)?(\w+)\/status(?:es)?\/(\d+)") link_pattern = re.compile(r"(?:twitter|x).com\/(?:\#!\/)?(\w+)\/status(?:es)?\/(\d+)")
link_clean_pattern = re.compile(r"(.+twitter\.com\/.+\/\d+)(\?)*.*") link_clean_pattern = re.compile(r"(.+(?:twitter|x)\.com\/.+\/\d+)(\?)*.*")
def __init__(self, config: dict) -> None: def __init__(self, config: dict) -> None:
super().__init__(config) super().__init__(config)

View File

@@ -25,7 +25,7 @@ class YoutubeDLArchiver(Archiver):
logger.debug('Using Facebook cookie') logger.debug('Using Facebook cookie')
yt_dlp.utils.std_headers['cookie'] = self.facebook_cookie yt_dlp.utils.std_headers['cookie'] = self.facebook_cookie
ydl = yt_dlp.YoutubeDL({'outtmpl': os.path.join(ArchivingContext.get_tmp_dir(), f'%(id)s.%(ext)s'), 'quiet': False}) ydl = yt_dlp.YoutubeDL({'outtmpl': os.path.join(ArchivingContext.get_tmp_dir(), f'%(id)s.%(ext)s'), 'quiet': False, 'noplaylist': True})
try: try:
# don't download since it can be a live stream # don't download since it can be a live stream

View File

@@ -65,7 +65,9 @@ class Media:
@property # getter .mimetype @property # getter .mimetype
def mimetype(self) -> str: def mimetype(self) -> str:
assert self.filename is not None and len(self.filename) > 0, "cannot get mimetype from media without filename" if not self.filename or len(self.filename) == 0:
logger.warning(f"cannot get mimetype from media without filename: {self}")
return ""
if not self._mimetype: if not self._mimetype:
self._mimetype = mimetypes.guess_type(self.filename)[0] self._mimetype = mimetypes.guess_type(self.filename)[0]
return self._mimetype or "" return self._mimetype or ""

View File

@@ -7,6 +7,8 @@ from dataclasses_json import dataclass_json, config
import datetime import datetime
from urllib.parse import urlparse from urllib.parse import urlparse
from dateutil.parser import parse as parse_dt from dateutil.parser import parse as parse_dt
from loguru import logger
from .media import Media from .media import Media
from .context import ArchivingContext from .context import ArchivingContext
@@ -105,10 +107,16 @@ class Metadata:
def get_timestamp(self, utc=True, iso=True) -> datetime.datetime: def get_timestamp(self, utc=True, iso=True) -> datetime.datetime:
ts = self.get("timestamp") ts = self.get("timestamp")
if not ts: return ts if not ts: return
if utc: ts = ts.replace(tzinfo=datetime.timezone.utc) try:
if iso: return ts.isoformat() if type(ts) == str: ts = datetime.datetime.fromisoformat(ts)
return ts if type(ts) == float: ts = datetime.datetime.fromtimestamp(ts)
if utc: ts = ts.replace(tzinfo=datetime.timezone.utc)
if iso: return ts.isoformat()
return ts
except Exception as e:
logger.error(f"Unable to parse timestamp {ts}: {e}")
return
def add_media(self, media: Media, id: str = None) -> Metadata: def add_media(self, media: Media, id: str = None) -> Metadata:
# adds a new media, optionally including an id # adds a new media, optionally including an id
@@ -164,3 +172,16 @@ class Metadata:
def __str__(self) -> str: def __str__(self) -> str:
return self.__repr__() return self.__repr__()
@staticmethod
def choose_most_complete(results: List[Metadata]) -> Metadata:
    """Return the most complete result from a list of results.

    Completeness is ranked by the number of media first and by the number
    of metadata entries second. When several results tie on both counts,
    the earliest one in ``results`` is returned.

    Returns ``None`` when ``results`` is empty (or ``None``).
    """
    if not results:
        return None
    # max() keeps the first maximal element, matching a left-to-right scan
    # that only replaces the current best on a strictly better candidate
    return max(results, key=lambda r: (len(r.media), len(r.metadata)))

View File

@@ -77,7 +77,7 @@ class ArchivingOrchestrator:
if cached_result: if cached_result:
logger.debug("Found previously archived entry") logger.debug("Found previously archived entry")
for d in self.databases: for d in self.databases:
d.done(cached_result) d.done(cached_result, cached=True)
return cached_result return cached_result
# 3 - call archivers until one succeeds # 3 - call archivers until one succeeds

View File

@@ -1,3 +1,4 @@
from typing import Union
import requests, os import requests, os
from loguru import logger from loguru import logger
@@ -14,23 +15,48 @@ class AAApiDb(Database):
def __init__(self, config: dict) -> None: def __init__(self, config: dict) -> None:
# without this STEP.__init__ is not called # without this STEP.__init__ is not called
super().__init__(config) super().__init__(config)
self.allow_rearchive = bool(self.allow_rearchive)
self.store_results = bool(self.store_results)
self.assert_valid_string("api_endpoint") self.assert_valid_string("api_endpoint")
self.assert_valid_string("api_secret")
@staticmethod @staticmethod
def configs() -> dict: def configs() -> dict:
return { return {
"api_endpoint": {"default": None, "help": "API endpoint where calls are made to"}, "api_endpoint": {"default": None, "help": "API endpoint where calls are made to"},
"api_secret": {"default": None, "help": "API authentication secret"}, "api_secret": {"default": None, "help": "API Basic authentication secret [deprecating soon]"},
"api_token": {"default": None, "help": "API Bearer token, to be preferred over secret (Basic auth) going forward"},
"public": {"default": False, "help": "whether the URL should be publicly available via the API"}, "public": {"default": False, "help": "whether the URL should be publicly available via the API"},
"author_id": {"default": None, "help": "which email to assign as author"}, "author_id": {"default": None, "help": "which email to assign as author"},
"group_id": {"default": None, "help": "which group of users have access to the archive in case public=false as author"}, "group_id": {"default": None, "help": "which group of users have access to the archive in case public=false as author"},
"allow_rearchive": {"default": True, "help": "if False then the API database will be queried prior to any archiving operations and stop if the link has already been archived"},
"store_results": {"default": True, "help": "when set, will send the results to the API database."},
"tags": {"default": [], "help": "what tags to add to the archived URL", "cli_set": lambda cli_val, cur_val: set(cli_val.split(","))}, "tags": {"default": [], "help": "what tags to add to the archived URL", "cli_set": lambda cli_val, cur_val: set(cli_val.split(","))},
} }
def fetch(self, item: Metadata) -> Union[Metadata, bool]:
""" query the database for the existence of this item"""
if not self.allow_rearchive: return
params = {"url": item.get_url(), "limit": 15}
headers = {"Authorization": f"Bearer {self.api_token}", "accept": "application/json"}
response = requests.get(os.path.join(self.api_endpoint, "tasks/search-url"), params=params, headers=headers)
def done(self, item: Metadata) -> None: if response.status_code == 200:
if len(response.json()):
logger.success(f"API returned {len(response.json())} previously archived instance(s)")
fetched_metadata = [Metadata.from_dict(r["result"]) for r in response.json()]
return Metadata.choose_most_complete(fetched_metadata)
else:
logger.error(f"AA API FAIL ({response.status_code}): {response.json()}")
return False
def done(self, item: Metadata, cached: bool=False) -> None:
"""archival result ready - should be saved to DB""" """archival result ready - should be saved to DB"""
logger.info(f"saving archive of {item.get_url()} to the AA API.") if not self.store_results: return
if cached:
logger.debug(f"skipping saving archive of {item.get_url()} to the AA API because it was cached")
return
logger.debug(f"saving archive of {item.get_url()} to the AA API.")
payload = {'result': item.to_json(), 'public': self.public, 'author_id': self.author_id, 'group_id': self.group_id, 'tags': list(self.tags)} payload = {'result': item.to_json(), 'public': self.public, 'author_id': self.author_id, 'group_id': self.group_id, 'tags': list(self.tags)}
response = requests.post(os.path.join(self.api_endpoint, "submit-archive"), json=payload, auth=("abc", self.api_secret)) response = requests.post(os.path.join(self.api_endpoint, "submit-archive"), json=payload, auth=("abc", self.api_secret))
@@ -39,3 +65,5 @@ class AAApiDb(Database):
logger.success(f"AA API: {response.json()}") logger.success(f"AA API: {response.json()}")
else: else:
logger.error(f"AA API FAIL ({response.status_code}): {response.json()}") logger.error(f"AA API FAIL ({response.status_code}): {response.json()}")

View File

@@ -27,6 +27,6 @@ class ConsoleDb(Database):
def aborted(self, item: Metadata) -> None: def aborted(self, item: Metadata) -> None:
logger.warning(f"ABORTED {item}") logger.warning(f"ABORTED {item}")
def done(self, item: Metadata) -> None: def done(self, item: Metadata, cached: bool=False) -> None:
"""archival result ready - should be saved to DB""" """archival result ready - should be saved to DB"""
logger.success(f"DONE {item}") logger.success(f"DONE {item}")

View File

@@ -24,7 +24,7 @@ class CSVDb(Database):
"csv_file": {"default": "db.csv", "help": "CSV file name"} "csv_file": {"default": "db.csv", "help": "CSV file name"}
} }
def done(self, item: Metadata) -> None: def done(self, item: Metadata, cached: bool=False) -> None:
"""archival result ready - should be saved to DB""" """archival result ready - should be saved to DB"""
logger.success(f"DONE {item}") logger.success(f"DONE {item}")
is_empty = not os.path.isfile(self.csv_file) or os.path.getsize(self.csv_file) == 0 is_empty = not os.path.isfile(self.csv_file) or os.path.getsize(self.csv_file) == 0

View File

@@ -36,6 +36,6 @@ class Database(Step, ABC):
return False return False
@abstractmethod @abstractmethod
def done(self, item: Metadata) -> None: def done(self, item: Metadata, cached: bool=False) -> None:
"""archival result ready - should be saved to DB""" """archival result ready - should be saved to DB"""
pass pass

View File

@@ -41,7 +41,7 @@ class GsheetsDb(Database):
"""check if the given item has been archived already""" """check if the given item has been archived already"""
return False return False
def done(self, item: Metadata) -> None: def done(self, item: Metadata, cached: bool=False) -> None:
"""archival result ready - should be saved to DB""" """archival result ready - should be saved to DB"""
logger.success(f"DONE {item.get_url()}") logger.success(f"DONE {item.get_url()}")
gw, row = self._retrieve_gsheet(item) gw, row = self._retrieve_gsheet(item)
@@ -57,8 +57,10 @@ class GsheetsDb(Database):
cell_updates.append((row, col, final_value)) cell_updates.append((row, col, final_value))
except Exception as e: except Exception as e:
logger.error(f"Unable to batch {col}={final_value} due to {e}") logger.error(f"Unable to batch {col}={final_value} due to {e}")
status_message = item.status
cell_updates.append((row, 'status', item.status)) if cached:
status_message = f"[cached] {status_message}"
cell_updates.append((row, 'status', status_message))
media: Media = item.get_final_media() media: Media = item.get_final_media()
if hasattr(media, "urls"): if hasattr(media, "urls"):

View File

@@ -6,4 +6,5 @@ from .thumbnail_enricher import ThumbnailEnricher
from .wacz_enricher import WaczArchiverEnricher from .wacz_enricher import WaczArchiverEnricher
from .whisper_enricher import WhisperEnricher from .whisper_enricher import WhisperEnricher
from .pdq_hash_enricher import PdqHashEnricher from .pdq_hash_enricher import PdqHashEnricher
from .metadata_enricher import MetadataEnricher from .metadata_enricher import MetadataEnricher
from .meta_enricher import MetaEnricher

View File

@@ -0,0 +1,55 @@
import datetime
import os
from loguru import logger
from . import Enricher
from ..core import Metadata
class MetaEnricher(Enricher):
    """
    Adds metadata information about the archive operations, to be included at the end of all enrichments
    """
    name = "meta_enricher"

    def __init__(self, config: dict) -> None:
        # without this STEP.__init__ is not called
        super().__init__(config)

    @staticmethod
    def configs() -> dict:
        """This enricher has no configurable options."""
        return {}

    def enrich(self, to_enrich: Metadata) -> None:
        """Annotate ``to_enrich`` with file sizes and the archive duration."""
        logger.debug(f"calculating archive metadata information for url={to_enrich.get_url()}")
        self.enrich_file_sizes(to_enrich)
        self.enrich_archive_duration(to_enrich)

    def enrich_file_sizes(self, to_enrich):
        """Store per-media and total file sizes on ``to_enrich``.

        Each media gets ``bytes`` (raw size) and ``size`` (human readable);
        the item itself gets ``total_bytes`` and ``total_size``. A media whose
        file cannot be inspected is logged and left out of the totals instead
        of aborting the whole enrichment.
        """
        logger.debug(f"calculating archive file sizes for url={to_enrich.get_url()} ({len(to_enrich.media)} media files)")
        total_size = 0
        for m in to_enrich.media:
            try:
                file_size = os.stat(m.filename).st_size
            except (OSError, TypeError) as e:
                # OSError: file is missing/unreadable; TypeError: filename is None
                logger.warning(f"unable to read file size of {m.filename}: {e}")
                continue
            m.set("bytes", file_size)
            m.set("size", self.human_readable_bytes(file_size))
            total_size += file_size

        to_enrich.set("total_bytes", total_size)
        to_enrich.set("total_size", self.human_readable_bytes(total_size))

    def human_readable_bytes(self, size: int) -> str:
        """Receive a number of bytes and return a human readable size.

        Uses 1024-based steps and one decimal place, e.g. ``"1.5 KB"``.
        """
        for unit in ["bytes", "KB", "MB", "GB", "TB"]:
            if size < 1024:
                return f"{size:.1f} {unit}"
            size /= 1024
        # 1024 TB or more: report in PB rather than falling through to None
        return f"{size:.1f} PB"

    def enrich_archive_duration(self, to_enrich):
        """Store the seconds elapsed since ``_processed_at`` as ``archive_duration_seconds``."""
        logger.debug(f"calculating archive duration for url={to_enrich.get_url()} ")
        # NOTE(review): assumes "_processed_at" is a naive UTC datetime, since it
        # is subtracted from utcnow() -- confirm where it is set.
        archive_duration = datetime.datetime.utcnow() - to_enrich.get("_processed_at")
        # total_seconds() includes the days component, which .seconds drops
        to_enrich.set("archive_duration_seconds", int(archive_duration.total_seconds()))

View File

@@ -1,9 +1,10 @@
from loguru import logger from loguru import logger
import time, uuid, os import time, os
from selenium.common.exceptions import TimeoutException from selenium.common.exceptions import TimeoutException
from . import Enricher from . import Enricher
from ..utils import Webdriver, UrlUtil from ..utils import Webdriver, UrlUtil, random_str
from ..core import Media, Metadata, ArchivingContext from ..core import Media, Metadata, ArchivingContext
class ScreenshotEnricher(Enricher): class ScreenshotEnricher(Enricher):
@@ -29,7 +30,7 @@ class ScreenshotEnricher(Enricher):
try: try:
driver.get(url) driver.get(url)
time.sleep(int(self.sleep_before_screenshot)) time.sleep(int(self.sleep_before_screenshot))
screenshot_file = os.path.join(ArchivingContext.get_tmp_dir(), f"screenshot_{str(uuid.uuid4())[0:8]}.png") screenshot_file = os.path.join(ArchivingContext.get_tmp_dir(), f"screenshot_{random_str(8)}.png")
driver.save_screenshot(screenshot_file) driver.save_screenshot(screenshot_file)
to_enrich.add_media(Media(filename=screenshot_file), id="screenshot") to_enrich.add_media(Media(filename=screenshot_file), id="screenshot")
except TimeoutException: except TimeoutException:

View File

@@ -1,8 +1,9 @@
import ffmpeg, os, uuid import ffmpeg, os
from loguru import logger from loguru import logger
from . import Enricher from . import Enricher
from ..core import Media, Metadata, ArchivingContext from ..core import Media, Metadata, ArchivingContext
from ..utils.misc import random_str
class ThumbnailEnricher(Enricher): class ThumbnailEnricher(Enricher):
@@ -23,7 +24,7 @@ class ThumbnailEnricher(Enricher):
logger.debug(f"generating thumbnails") logger.debug(f"generating thumbnails")
for i, m in enumerate(to_enrich.media[::]): for i, m in enumerate(to_enrich.media[::]):
if m.is_video(): if m.is_video():
folder = os.path.join(ArchivingContext.get_tmp_dir(), str(uuid.uuid4())) folder = os.path.join(ArchivingContext.get_tmp_dir(), random_str(24))
os.makedirs(folder, exist_ok=True) os.makedirs(folder, exist_ok=True)
logger.debug(f"generating thumbnails for {m.filename}") logger.debug(f"generating thumbnails for {m.filename}")
fps, duration = 0.5, m.get("duration") fps, duration = 0.5, m.get("duration")

View File

@@ -1,5 +1,6 @@
import jsonlines
import mimetypes import mimetypes
import os, shutil, subprocess, uuid import os, shutil, subprocess
from zipfile import ZipFile from zipfile import ZipFile
from loguru import logger from loguru import logger
from warcio.archiveiterator import ArchiveIterator from warcio.archiveiterator import ArchiveIterator
@@ -7,7 +8,7 @@ from warcio.archiveiterator import ArchiveIterator
from ..core import Media, Metadata, ArchivingContext from ..core import Media, Metadata, ArchivingContext
from . import Enricher from . import Enricher
from ..archivers import Archiver from ..archivers import Archiver
from ..utils import UrlUtil from ..utils import UrlUtil, random_str
class WaczArchiverEnricher(Enricher, Archiver): class WaczArchiverEnricher(Enricher, Archiver):
@@ -27,8 +28,10 @@ class WaczArchiverEnricher(Enricher, Archiver):
def configs() -> dict: def configs() -> dict:
return { return {
"profile": {"default": None, "help": "browsertrix-profile (for profile generation see https://github.com/webrecorder/browsertrix-crawler#creating-and-using-browser-profiles)."}, "profile": {"default": None, "help": "browsertrix-profile (for profile generation see https://github.com/webrecorder/browsertrix-crawler#creating-and-using-browser-profiles)."},
"docker_commands": {"default": None, "help":"if a custom docker invocation is needed"},
"timeout": {"default": 120, "help": "timeout for WACZ generation in seconds"}, "timeout": {"default": 120, "help": "timeout for WACZ generation in seconds"},
"extract_media": {"default": True, "help": "If enabled all the images/videos/audio present in the WACZ archive will be extracted into separate Media. The .wacz file will be kept untouched."} "extract_media": {"default": False, "help": "If enabled all the images/videos/audio present in the WACZ archive will be extracted into separate Media and appear in the html report. The .wacz file will be kept untouched."},
"extract_screenshot": {"default": True, "help": "If enabled the screenshot captured by browsertrix will be extracted into separate Media and appear in the html report. The .wacz file will be kept untouched."}
} }
def download(self, item: Metadata) -> Metadata: def download(self, item: Metadata) -> Metadata:
@@ -45,52 +48,46 @@ class WaczArchiverEnricher(Enricher, Archiver):
url = to_enrich.get_url() url = to_enrich.get_url()
collection = str(uuid.uuid4())[0:8] collection = random_str(8)
browsertrix_home = os.path.abspath(ArchivingContext.get_tmp_dir()) browsertrix_home_host = os.environ.get('BROWSERTRIX_HOME_HOST') or os.path.abspath(ArchivingContext.get_tmp_dir())
browsertrix_home_container = os.environ.get('BROWSERTRIX_HOME_CONTAINER') or browsertrix_home_host
if os.getenv('RUNNING_IN_DOCKER'): cmd = [
"crawl",
"--url", url,
"--scopeType", "page",
"--generateWACZ",
"--text",
"--screenshot", "fullPage",
"--collection", collection,
"--id", collection,
"--saveState", "never",
"--behaviors", "autoscroll,autoplay,autofetch,siteSpecific",
"--behaviorTimeout", str(self.timeout),
"--timeout", str(self.timeout)]
# call docker if explicitly enabled or we are running on the host (not in docker)
use_docker = os.environ.get('WACZ_ENABLE_DOCKER') or not os.environ.get('RUNNING_IN_DOCKER')
if use_docker:
logger.debug(f"generating WACZ in Docker for {url=}")
logger.debug(f"{browsertrix_home_host=} {browsertrix_home_container=}")
if self.docker_commands:
cmd = self.docker_commands + cmd
else:
cmd = ["docker", "run", "--rm", "-v", f"{browsertrix_home_host}:/crawls/", "webrecorder/browsertrix-crawler"] + cmd
if self.profile:
profile_fn = os.path.join(browsertrix_home_container, "profile.tar.gz")
logger.debug(f"copying {self.profile} to {profile_fn}")
shutil.copyfile(self.profile, profile_fn)
cmd.extend(["--profile", os.path.join("/crawls", "profile.tar.gz")])
else:
logger.debug(f"generating WACZ without Docker for {url=}") logger.debug(f"generating WACZ without Docker for {url=}")
cmd = [
"crawl",
"--url", url,
"--scopeType", "page",
"--generateWACZ",
"--text",
"--screenshot", "fullPage",
"--collection", collection,
"--id", collection,
"--saveState", "never",
"--behaviors", "autoscroll,autoplay,autofetch,siteSpecific",
"--behaviorTimeout", str(self.timeout),
"--timeout", str(self.timeout)]
if self.profile: if self.profile:
cmd.extend(["--profile", os.path.join("/app", str(self.profile))]) cmd.extend(["--profile", os.path.join("/app", str(self.profile))])
else:
logger.debug(f"generating WACZ in Docker for {url=}")
cmd = [
"docker", "run",
"--rm", # delete container once it has completed running
"-v", f"{browsertrix_home}:/crawls/",
# "-it", # this leads to "the input device is not a TTY"
"webrecorder/browsertrix-crawler", "crawl",
"--url", url,
"--scopeType", "page",
"--generateWACZ",
"--text",
"--screenshot", "fullPage",
"--collection", collection,
"--behaviors", "autoscroll,autoplay,autofetch,siteSpecific",
"--behaviorTimeout", str(self.timeout),
"--timeout", str(self.timeout)
]
if self.profile:
profile_fn = os.path.join(browsertrix_home, "profile.tar.gz")
shutil.copyfile(self.profile, profile_fn)
cmd.extend(["--profile", os.path.join("/crawls", "profile.tar.gz")])
try: try:
logger.info(f"Running browsertrix-crawler: {' '.join(cmd)}") logger.info(f"Running browsertrix-crawler: {' '.join(cmd)}")
@@ -99,18 +96,36 @@ class WaczArchiverEnricher(Enricher, Archiver):
logger.error(f"WACZ generation failed: {e}") logger.error(f"WACZ generation failed: {e}")
return False return False
if os.getenv('RUNNING_IN_DOCKER'): if use_docker:
filename = os.path.join("collections", collection, f"{collection}.wacz") wacz_fn = os.path.join(browsertrix_home_container, "collections", collection, f"{collection}.wacz")
else: else:
filename = os.path.join(browsertrix_home, "collections", collection, f"{collection}.wacz") wacz_fn = os.path.join("collections", collection, f"{collection}.wacz")
if not os.path.exists(filename): if not os.path.exists(wacz_fn):
logger.warning(f"Unable to locate and upload WACZ {filename=}") logger.warning(f"Unable to locate and upload WACZ {wacz_fn=}")
return False return False
to_enrich.add_media(Media(filename), "browsertrix") to_enrich.add_media(Media(wacz_fn), "browsertrix")
if self.extract_media: if self.extract_media or self.extract_screenshot:
self.extract_media_from_wacz(to_enrich, filename) self.extract_media_from_wacz(to_enrich, wacz_fn)
if use_docker:
jsonl_fn = os.path.join(browsertrix_home_container, "collections", collection, "pages", "pages.jsonl")
else:
jsonl_fn = os.path.join("collections", collection, "pages", "pages.jsonl")
if not os.path.exists(jsonl_fn):
logger.warning(f"Unable to locate and pages.jsonl {jsonl_fn=}")
else:
logger.info(f"Parsing pages.jsonl {jsonl_fn=}")
with jsonlines.open(jsonl_fn) as reader:
for obj in reader:
if 'title' in obj:
to_enrich.set_title(obj['title'])
if 'text' in obj:
to_enrich.set_content(obj['text'])
return True return True
def extract_media_from_wacz(self, to_enrich: Metadata, wacz_filename: str) -> None: def extract_media_from_wacz(self, to_enrich: Metadata, wacz_filename: str) -> None:
@@ -141,12 +156,13 @@ class WaczArchiverEnricher(Enricher, Archiver):
with open(warc_filename, 'rb') as warc_stream: with open(warc_filename, 'rb') as warc_stream:
for record in ArchiveIterator(warc_stream): for record in ArchiveIterator(warc_stream):
# only include fetched resources # only include fetched resources
if record.rec_type == "resource": # screenshots if record.rec_type == "resource" and self.extract_screenshot: # screenshots
fn = os.path.join(tmp_dir, f"warc-file-{counter}.png") fn = os.path.join(tmp_dir, f"warc-file-{counter}.png")
with open(fn, "wb") as outf: outf.write(record.raw_stream.read()) with open(fn, "wb") as outf: outf.write(record.raw_stream.read())
m = Media(filename=fn) m = Media(filename=fn)
to_enrich.add_media(m, "browsertrix-screenshot") to_enrich.add_media(m, "browsertrix-screenshot")
counter += 1 counter += 1
if not self.extract_media: continue
if record.rec_type != 'response': continue if record.rec_type != 'response': continue
record_url = record.rec_headers.get_header('WARC-Target-URI') record_url = record.rec_headers.get_header('WARC-Target-URI')

View File

@@ -1,6 +1,6 @@
from __future__ import annotations from __future__ import annotations
from dataclasses import dataclass from dataclasses import dataclass
import mimetypes, uuid, os, pathlib import mimetypes, os, pathlib
from jinja2 import Environment, FileSystemLoader from jinja2 import Environment, FileSystemLoader
from urllib.parse import quote from urllib.parse import quote
from loguru import logger from loguru import logger
@@ -9,6 +9,7 @@ from ..version import __version__
from ..core import Metadata, Media, ArchivingContext from ..core import Metadata, Media, ArchivingContext
from . import Formatter from . import Formatter
from ..enrichers import HashEnricher from ..enrichers import HashEnricher
from ..utils.misc import random_str
@dataclass @dataclass
@@ -44,10 +45,10 @@ class HtmlFormatter(Formatter):
metadata=item.metadata, metadata=item.metadata,
version=__version__ version=__version__
) )
html_path = os.path.join(ArchivingContext.get_tmp_dir(), f"formatted{str(uuid.uuid4())}.html") html_path = os.path.join(ArchivingContext.get_tmp_dir(), f"formatted{random_str(24)}.html")
with open(html_path, mode="w", encoding="utf-8") as outf: with open(html_path, mode="w", encoding="utf-8") as outf:
outf.write(content) outf.write(content)
final_media = Media(filename=html_path) final_media = Media(filename=html_path, _mimetype="text/html")
he = HashEnricher({"hash_enricher": {"algorithm": ArchivingContext.get("hash_enricher.algorithm"), "chunksize": 1.6e7}}) he = HashEnricher({"hash_enricher": {"algorithm": ArchivingContext.get("hash_enricher.algorithm"), "chunksize": 1.6e7}})
if len(hd := he.calculate_hash(final_media.filename)): if len(hd := he.calculate_hash(final_media.filename)):

View File

@@ -101,7 +101,7 @@
<body> <body>
<div id="notification"></div> <div id="notification"></div>
<h2>Archived media for <a href="{{ url }}">{{ url }}</a></h2> <h2>Archived media for <span class="copy">{{ url }}</span> - <a href="{{ url }}">open</a></h2>
{% if title | string | length > 0 %} {% if title | string | length > 0 %}
<p><b>title:</b> '<span class="copy">{{ title }}</span>'</p> <p><b>title:</b> '<span class="copy">{{ title }}</span>'</p>
{% endif %} {% endif %}
@@ -115,7 +115,7 @@
<table class="content"> <table class="content">
<tr> <tr>
<th>about</th> <th>about</th>
<th>preview(s)</th> <th>files and preview</th>
</tr> </tr>
<tbody> <tbody>
{% for m in media %} {% for m in media %}

View File

@@ -52,7 +52,7 @@ class GDriveStorage(Storage):
else: else:
logger.debug('GD OAuth Token valid') logger.debug('GD OAuth Token valid')
else: else:
gd_service_account = config.service_account gd_service_account = self.service_account
logger.debug(f'Using GD Service Account {gd_service_account}') logger.debug(f'Using GD Service Account {gd_service_account}')
creds = service_account.Credentials.from_service_account_file(gd_service_account, scopes=SCOPES) creds = service_account.Credentials.from_service_account_file(gd_service_account, scopes=SCOPES)
@@ -87,15 +87,6 @@ class GDriveStorage(Storage):
file_id = self._get_id_from_parent_and_name(folder_id, filename) file_id = self._get_id_from_parent_and_name(folder_id, filename)
return f"https://drive.google.com/file/d/{file_id}/view?usp=sharing" return f"https://drive.google.com/file/d/{file_id}/view?usp=sharing"
def upload(self, media: Media, **kwargs) -> bool:
# override parent so that we can use shutil.copy2 and keep metadata
dest = os.path.join(self.save_to, media.key)
os.makedirs(os.path.dirname(dest), exist_ok=True)
logger.debug(f'[{self.__class__.name}] storing file {media.filename} with key {media.key} to {dest}')
res = shutil.copy2(media.filename, dest)
logger.info(res)
return True
def upload(self, media: Media, **kwargs) -> bool: def upload(self, media: Media, **kwargs) -> bool:
logger.debug(f'[{self.__class__.name}] storing file {media.filename} with key {media.key}') logger.debug(f'[{self.__class__.name}] storing file {media.filename} with key {media.key}')
""" """
@@ -119,7 +110,7 @@ class GDriveStorage(Storage):
'parents': [upload_to] 'parents': [upload_to]
} }
media = MediaFileUpload(media.filename, resumable=True) media = MediaFileUpload(media.filename, resumable=True)
gd_file = self.service.files().create(body=file_metadata, media_body=media, fields='id').execute() gd_file = self.service.files().create(supportsAllDrives=True, body=file_metadata, media_body=media, fields='id').execute()
logger.debug(f'uploadf: uploaded file {gd_file["id"]} successfully in folder={upload_to}') logger.debug(f'uploadf: uploaded file {gd_file["id"]} successfully in folder={upload_to}')
# must be implemented even if unused # must be implemented even if unused
@@ -150,6 +141,9 @@ class GDriveStorage(Storage):
for attempt in range(retries): for attempt in range(retries):
results = self.service.files().list( results = self.service.files().list(
# both below for Google Shared Drives
supportsAllDrives=True,
includeItemsFromAllDrives=True,
q=query_string, q=query_string,
spaces='drive', # ie not appDataFolder or photos spaces='drive', # ie not appDataFolder or photos
fields='files(id, name)' fields='files(id, name)'
@@ -182,7 +176,7 @@ class GDriveStorage(Storage):
'mimeType': 'application/vnd.google-apps.folder', 'mimeType': 'application/vnd.google-apps.folder',
'parents': [parent_id] 'parents': [parent_id]
} }
gd_folder = self.service.files().create(body=file_metadata, fields='id').execute() gd_folder = self.service.files().create(supportsAllDrives=True, body=file_metadata, fields='id').execute()
return gd_folder.get('id') return gd_folder.get('id')
# def exists(self, key): # def exists(self, key):

View File

@@ -1,14 +1,14 @@
from typing import IO, Any from typing import IO
import boto3, uuid, os, mimetypes import boto3, os
from botocore.errorfactory import ClientError
from ..core import Metadata from ..utils.misc import random_str
from ..core import Media from ..core import Media
from ..storages import Storage from ..storages import Storage
from ..enrichers import HashEnricher
from loguru import logger from loguru import logger
from slugify import slugify
NO_DUPLICATES_FOLDER = "no-dups/"
class S3Storage(Storage): class S3Storage(Storage):
name = "s3_storage" name = "s3_storage"
@@ -21,6 +21,9 @@ class S3Storage(Storage):
aws_access_key_id=self.key, aws_access_key_id=self.key,
aws_secret_access_key=self.secret aws_secret_access_key=self.secret
) )
self.random_no_duplicate = bool(self.random_no_duplicate)
if self.random_no_duplicate:
logger.warning("random_no_duplicate is set to True, this will override `path_generator`, `filename_generator` and `folder`.")
@staticmethod @staticmethod
def configs() -> dict: def configs() -> dict:
@@ -31,7 +34,7 @@ class S3Storage(Storage):
"region": {"default": None, "help": "S3 region name"}, "region": {"default": None, "help": "S3 region name"},
"key": {"default": None, "help": "S3 API key"}, "key": {"default": None, "help": "S3 API key"},
"secret": {"default": None, "help": "S3 API secret"}, "secret": {"default": None, "help": "S3 API secret"},
# TODO: how to have sth like a custom folder? has to come from the feeders "random_no_duplicate": {"default": False, "help": f"if set, it will override `path_generator`, `filename_generator` and `folder`. It will check if the file already exists and if so it will not upload it again. Creates a new root folder path `{NO_DUPLICATES_FOLDER}`"},
"endpoint_url": { "endpoint_url": {
"default": 'https://{region}.digitaloceanspaces.com', "default": 'https://{region}.digitaloceanspaces.com',
"help": "S3 bucket endpoint, {region} are inserted at runtime" "help": "S3 bucket endpoint, {region} are inserted at runtime"
@@ -47,6 +50,8 @@ class S3Storage(Storage):
return self.cdn_url.format(bucket=self.bucket, region=self.region, key=media.key) return self.cdn_url.format(bucket=self.bucket, region=self.region, key=media.key)
def uploadf(self, file: IO[bytes], media: Media, **kwargs: dict) -> None: def uploadf(self, file: IO[bytes], media: Media, **kwargs: dict) -> None:
if not self.is_upload_needed(media): return True
extra_args = kwargs.get("extra_args", {}) extra_args = kwargs.get("extra_args", {})
if not self.private and 'ACL' not in extra_args: if not self.private and 'ACL' not in extra_args:
extra_args['ACL'] = 'public-read' extra_args['ACL'] = 'public-read'
@@ -60,14 +65,31 @@ class S3Storage(Storage):
self.s3.upload_fileobj(file, Bucket=self.bucket, Key=media.key, ExtraArgs=extra_args) self.s3.upload_fileobj(file, Bucket=self.bucket, Key=media.key, ExtraArgs=extra_args)
return True return True
def is_upload_needed(self, media: Media) -> bool:
if self.random_no_duplicate:
# checks if a folder with the hash already exists, if so it skips the upload
he = HashEnricher({"hash_enricher": {"algorithm": "SHA-256", "chunksize": 1.6e7}})
hd = he.calculate_hash(media.filename)
path = os.path.join(NO_DUPLICATES_FOLDER, hd[:24])
if existing_key:=self.file_in_folder(path):
media.key = existing_key
media.set("previously archived", True)
logger.debug(f"skipping upload of {media.filename} because it already exists in {media.key}")
return False
_, ext = os.path.splitext(media.key)
media.key = os.path.join(path, f"{random_str(24)}{ext}")
return True
def file_in_folder(self, path:str) -> str:
# checks if path exists and is not an empty folder
if not path.endswith('/'):
path = path + '/'
resp = self.s3.list_objects(Bucket=self.bucket, Prefix=path, Delimiter='/', MaxKeys=1)
if 'Contents' in resp:
return resp['Contents'][0]['Key']
return False
# def exists(self, key: str) -> bool:
# """
# Tests if a given file with key=key exists in the bucket
# """
# try:
# self.s3.head_object(Bucket=self.bucket, Key=key)
# return True
# except ClientError as e:
# logger.warning(f"got a ClientError when checking if {key=} exists in bucket={self.bucket}: {e}")
# return False

View File

@@ -2,11 +2,13 @@ from __future__ import annotations
from abc import abstractmethod from abc import abstractmethod
from dataclasses import dataclass from dataclasses import dataclass
from typing import IO from typing import IO
import os
from ..utils.misc import random_str
from ..core import Media, Step, ArchivingContext from ..core import Media, Step, ArchivingContext
from ..enrichers import HashEnricher from ..enrichers import HashEnricher
from loguru import logger from loguru import logger
import os, uuid
from slugify import slugify from slugify import slugify
@@ -72,10 +74,10 @@ class Storage(Step):
filename = slugify(filename) # in case it comes with os.sep filename = slugify(filename) # in case it comes with os.sep
elif self.path_generator == "url": path = slugify(url) elif self.path_generator == "url": path = slugify(url)
elif self.path_generator == "random": elif self.path_generator == "random":
path = ArchivingContext.get("random_path", str(uuid.uuid4())[:16], True) path = ArchivingContext.get("random_path", random_str(24), True)
# filename_generator logic # filename_generator logic
if self.filename_generator == "random": filename = str(uuid.uuid4())[:16] if self.filename_generator == "random": filename = random_str(24)
elif self.filename_generator == "static": elif self.filename_generator == "static":
he = HashEnricher({"hash_enricher": {"algorithm": ArchivingContext.get("hash_enricher.algorithm"), "chunksize": 1.6e7}}) he = HashEnricher({"hash_enricher": {"algorithm": ArchivingContext.get("hash_enricher.algorithm"), "chunksize": 1.6e7}})
hd = he.calculate_hash(media.filename) hd = he.calculate_hash(media.filename)

View File

@@ -1,5 +1,6 @@
import os, json, requests import os, json, requests
import uuid
from datetime import datetime from datetime import datetime
from loguru import logger from loguru import logger
@@ -49,3 +50,7 @@ def update_nested_dict(dictionary, update_dict):
update_nested_dict(dictionary[key], value) update_nested_dict(dictionary[key], value)
else: else:
dictionary[key] = value dictionary[key] = value
def random_str(length: int = 32) -> str:
assert length <= 32, "length must be less than 32 as UUID4 is used"
return str(uuid.uuid4()).replace("-", "")[:length]

View File

@@ -65,6 +65,9 @@ class UrlUtil:
if "vk.com/images/" in url: return False if "vk.com/images/" in url: return False
if "vk.com/images/reaction/" in url: return False if "vk.com/images/reaction/" in url: return False
# wikipedia
if "wikipedia.org/static" in url: return False
return True return True
@staticmethod @staticmethod

View File

@@ -15,7 +15,7 @@ class Webdriver:
def __enter__(self) -> webdriver: def __enter__(self) -> webdriver:
options = webdriver.FirefoxOptions() options = webdriver.FirefoxOptions()
options.headless = True options.add_argument("--headless")
options.set_preference('network.protocol-handler.external.tg', False) options.set_preference('network.protocol-handler.external.tg', False)
try: try:
self.driver = webdriver.Firefox(options=options) self.driver = webdriver.Firefox(options=options)

View File

@@ -1,9 +1,9 @@
_MAJOR = "0" _MAJOR = "0"
_MINOR = "6" _MINOR = "8"
# On main and in a nightly release the patch should be one ahead of the last # On main and in a nightly release the patch should be one ahead of the last
# released build. # released build.
_PATCH = "5" _PATCH = "1"
# This is mainly for nightly builds which have the suffix ".dev$DATE". See # This is mainly for nightly builds which have the suffix ".dev$DATE". See
# https://semver.org/#is-v123-a-semantic-version for the semantics. # https://semver.org/#is-v123-a-semantic-version for the semantics.
_SUFFIX = "" _SUFFIX = ""