version bump and release without commit

Updated gd.py to work with shared folders (#102 )
Co-authored-by: msramalho <19508417+msramalho@users.noreply.github.com>
2026-06-12 13:18:28 +03:00 · 2023-09-22 10:18:58 +01:00 · 2023-09-22 10:17:54 +01:00 · 2023-09-20 10:18:44 +01:00 · 2023-09-20 10:17:31 +01:00 · 2023-09-15 19:53:07 +01:00
15 changed files with 464 additions and 428 deletions
--- a/11
+++ b/11
@@ -4,7 +4,6 @@ ENV RUNNING_IN_DOCKER=1
 WORKDIR /app
 # TODO: use custom ffmpeg builds instead of apt-get install
 RUN pip install --upgrade pip && \
 	pip install pipenv && \
 	add-apt-repository ppa:mozillateam/ppa && \
@@ -18,18 +17,14 @@ RUN pip install --upgrade pip && \
 	rm geckodriver-v*
 # TODO: avoid copying unnecessary files, including .git
 COPY Pipfile* ./
-RUN pipenv install
+# install from pipenv, with browsertrix-only requirements
 RUN pipenv install && \
 	pipenv install pywb uwsgi
 # doing this at the end helps during development, builds are quick
 COPY ./src/ . 
 # TODO: figure out how to make volumes not be root, does it depend on host or dockerfile?
 # RUN useradd --system --groups sudo --shell /bin/bash archiver && chown -R archiver:sudo .
 # USER archiver
 ENTRYPOINT ["pipenv", "run", "python3", "-m", "auto_archiver"]
 # should be executed with 2 volumes (3 if local_storage is used)
--- a/8
+++ b/8
@@ -32,15 +32,13 @@ cryptography = "*"
 dataclasses-json = "*"
 yt-dlp = "*"
 vk-url-scraper = "*"
 uwsgi = "*"
 requests = {extras = ["socks"], version = "*"}
 # wacz = "==0.4.8"
 numpy = "*"
 warcio = "*"
 [requires]
 python_version = "3.10"
 [dev-packages]
 autopep8 = "*"
 setuptools-pipfile = "*"
 [requires]
 python_version = "3.10"
--- a/Pipfile.lock
+++ b/Pipfile.lock
--- a/scripts/release.sh
+++ b/scripts/release.sh
@@ -8,8 +8,8 @@ TAG=$(python -c 'from src.auto_archiver.version import __version__; print("v" +
 read -p "Creating new release for $TAG. Do you want to continue? [Y/n] " prompt
 if [[ $prompt == "y" || $prompt == "Y" || $prompt == "yes" || $prompt == "Yes" ]]; then
-    git add -A
+    # git add -A
-    git commit -m "Bump version to $TAG for release" || true && git push
+    # git commit -m "Bump version to $TAG for release" || true && git push
    echo "Creating new git tag $TAG"
    git tag "$TAG" -m "$TAG"
    git push --tags
--- a/src/auto_archiver/archivers/twitter_archiver.py
+++ b/src/auto_archiver/archivers/twitter_archiver.py
@@ -15,8 +15,8 @@ class TwitterArchiver(Archiver):
    """
    name = "twitter_archiver"
-    link_pattern = re.compile(r"twitter.com\/(?:\#!\/)?(\w+)\/status(?:es)?\/(\d+)")
+    link_pattern = re.compile(r"(?:twitter|x).com\/(?:\#!\/)?(\w+)\/status(?:es)?\/(\d+)")
-    link_clean_pattern = re.compile(r"(.+twitter\.com\/.+\/\d+)(\?)*.*")
+    link_clean_pattern = re.compile(r"(.+(?:twitter|x)\.com\/.+\/\d+)(\?)*.*")
    def __init__(self, config: dict) -> None:
        super().__init__(config)
--- a/src/auto_archiver/archivers/youtubedl_archiver.py
+++ b/src/auto_archiver/archivers/youtubedl_archiver.py
@@ -25,7 +25,7 @@ class YoutubeDLArchiver(Archiver):
            logger.debug('Using Facebook cookie')
            yt_dlp.utils.std_headers['cookie'] = self.facebook_cookie
-        ydl = yt_dlp.YoutubeDL({'outtmpl': os.path.join(ArchivingContext.get_tmp_dir(), f'%(id)s.%(ext)s'), 'quiet': False})
+        ydl = yt_dlp.YoutubeDL({'outtmpl': os.path.join(ArchivingContext.get_tmp_dir(), f'%(id)s.%(ext)s'), 'quiet': False, 'noplaylist': True})
        try:
            # don't download since it can be a live stream
--- a/src/auto_archiver/core/media.py
+++ b/src/auto_archiver/core/media.py
@@ -89,7 +89,7 @@ class Media:
        try:
            streams = ffmpeg.probe(self.filename, select_streams='v')['streams']
            logger.warning(f"STREAMS FOR {self.filename} {streams}")
-            return any(s.get("duration_ts") > 0 for s in streams)
+            return any(s.get("duration_ts", 0) > 0 for s in streams)
        except Error: return False # ffmpeg errors when reading bad files
        except Exception as e:
            logger.error(e)
--- a/src/auto_archiver/enrichers/pdq_hash_enricher.py
+++ b/src/auto_archiver/enrichers/pdq_hash_enricher.py
@@ -26,11 +26,16 @@ class PdqHashEnricher(Enricher):
    def enrich(self, to_enrich: Metadata) -> None:
        url = to_enrich.get_url()
        logger.debug(f"calculating perceptual hashes for {url=}")
        media_with_hashes = []
        for m in to_enrich.media:
            for media in m.all_inner_media(True):
-                if media.is_image() and "screenshot" not in media.get("id") and "warc-file-" not in media.get("id") and len(hd := self.calculate_pdq_hash(media.filename)):
+                media_id = media.get("id", "")
                if media.is_image() and "screenshot" not in media_id and "warc-file-" not in media_id and len(hd := self.calculate_pdq_hash(media.filename)):
                    media.set("pdq_hash", hd)
                    media_with_hashes.append(media.filename)
        logger.debug(f"calculated '{len(media_with_hashes)}' perceptual hashes for {url=}: {media_with_hashes}")
    def calculate_pdq_hash(self, filename):
        # returns a hexadecimal string with the perceptual hash for the given filename
--- a/src/auto_archiver/enrichers/wacz_enricher.py
+++ b/src/auto_archiver/enrichers/wacz_enricher.py
@@ -27,6 +27,7 @@ class WaczArchiverEnricher(Enricher, Archiver):
    def configs() -> dict:
        return {
            "profile": {"default": None, "help": "browsertrix-profile (for profile generation see https://github.com/webrecorder/browsertrix-crawler#creating-and-using-browser-profiles)."},
            "docker_commands": {"default": None, "help":"if a custom docker invocation is needed"},
            "timeout": {"default": 120, "help": "timeout for WACZ generation in seconds"},
            "extract_media": {"default": True, "help": "If enabled all the images/videos/audio present in the WACZ archive will be extracted into separate Media. The .wacz file will be kept untouched."}
        }
@@ -46,51 +47,45 @@ class WaczArchiverEnricher(Enricher, Archiver):
        url = to_enrich.get_url()
        collection = str(uuid.uuid4())[0:8]
-        browsertrix_home = os.path.abspath(ArchivingContext.get_tmp_dir())
+        browsertrix_home_host = os.environ.get('BROWSERTRIX_HOME_HOST') or os.path.abspath(ArchivingContext.get_tmp_dir())
        browsertrix_home_container = os.environ.get('BROWSERTRIX_HOME_CONTAINER') or browsertrix_home_host
-        if os.getenv('RUNNING_IN_DOCKER'):
+        cmd = [
            "crawl",
            "--url", url,
            "--scopeType", "page",
            "--generateWACZ",
            "--text",
            "--screenshot", "fullPage",
            "--collection", collection,
            "--id", collection,
            "--saveState", "never",
            "--behaviors", "autoscroll,autoplay,autofetch,siteSpecific",
            "--behaviorTimeout", str(self.timeout),
            "--timeout", str(self.timeout)]
        # call docker if explicitly enabled or we are running on the host (not in docker)
        use_docker = os.environ.get('WACZ_ENABLE_DOCKER') or not os.environ.get('RUNNING_IN_DOCKER')
        if use_docker:
            logger.debug(f"generating WACZ in Docker for {url=}")
            logger.debug(f"{browsertrix_home_host=} {browsertrix_home_container=}")
            if self.docker_commands:
                cmd = self.docker_commands + cmd
            else:
                cmd = ["docker", "run", "--rm", "-v", f"{browsertrix_home_host}:/crawls/", "webrecorder/browsertrix-crawler"] + cmd
            if self.profile:
                profile_fn = os.path.join(browsertrix_home_container, "profile.tar.gz")
                logger.debug(f"copying {self.profile} to {profile_fn}")
                shutil.copyfile(self.profile, profile_fn)
                cmd.extend(["--profile", os.path.join("/crawls", "profile.tar.gz")])
        else:
            logger.debug(f"generating WACZ without Docker for {url=}")
            cmd = [
                "crawl",
                "--url", url,
                "--scopeType", "page",
                "--generateWACZ",
                "--text",
                "--screenshot", "fullPage",
                "--collection", collection,
                "--id", collection,
                "--saveState", "never",
                "--behaviors", "autoscroll,autoplay,autofetch,siteSpecific",
                "--behaviorTimeout", str(self.timeout),
                "--timeout", str(self.timeout)]
            if self.profile:
                cmd.extend(["--profile", os.path.join("/app", str(self.profile))])
        else:
            logger.debug(f"generating WACZ in Docker for {url=}")
            cmd = [
                "docker", "run",
                "--rm",  # delete container once it has completed running
                "-v", f"{browsertrix_home}:/crawls/",
                # "-it", # this leads to "the input device is not a TTY"
                "webrecorder/browsertrix-crawler", "crawl",
                "--url", url,
                "--scopeType", "page",
                "--generateWACZ",
                "--text",
                "--screenshot", "fullPage",
                "--collection", collection,
                "--behaviors", "autoscroll,autoplay,autofetch,siteSpecific",
                "--behaviorTimeout", str(self.timeout),
                "--timeout", str(self.timeout)
            ]
            if self.profile:
                profile_fn = os.path.join(browsertrix_home, "profile.tar.gz")
                shutil.copyfile(self.profile, profile_fn)
                cmd.extend(["--profile", os.path.join("/crawls", "profile.tar.gz")])
        try:
            logger.info(f"Running browsertrix-crawler: {' '.join(cmd)}")
@@ -99,18 +94,18 @@ class WaczArchiverEnricher(Enricher, Archiver):
            logger.error(f"WACZ generation failed: {e}")
            return False
-        if os.getenv('RUNNING_IN_DOCKER'):
+        if use_docker:
-            filename = os.path.join("collections", collection, f"{collection}.wacz")
+            wacz_fn = os.path.join(browsertrix_home_container, "collections", collection, f"{collection}.wacz")
        else:
-            filename = os.path.join(browsertrix_home, "collections", collection, f"{collection}.wacz")
+            wacz_fn = os.path.join("collections", collection, f"{collection}.wacz")
-        if not os.path.exists(filename):
+        if not os.path.exists(wacz_fn):
-            logger.warning(f"Unable to locate and upload WACZ  {filename=}")
+            logger.warning(f"Unable to locate and upload WACZ  {wacz_fn=}")
            return False
-        to_enrich.add_media(Media(filename), "browsertrix")
+        to_enrich.add_media(Media(wacz_fn), "browsertrix")
        if self.extract_media:
-            self.extract_media_from_wacz(to_enrich, filename)
+            self.extract_media_from_wacz(to_enrich, wacz_fn)
        return True
    def extract_media_from_wacz(self, to_enrich: Metadata, wacz_filename: str) -> None:
--- a/src/auto_archiver/enrichers/wayback_enricher.py
+++ b/src/auto_archiver/enrichers/wayback_enricher.py
@@ -23,6 +23,7 @@ class WaybackArchiverEnricher(Enricher, Archiver):
    def configs() -> dict:
        return {
            "timeout": {"default": 15, "help": "seconds to wait for successful archive confirmation from wayback, if more than this passes the result contains the job_id so the status can later be checked manually."},
            "if_not_archived_within": {"default": None, "help": "only tell wayback to archive if no archive is available before the number of seconds specified, use None to ignore this option. For more information: https://docs.google.com/document/d/1Nsv52MvSjbLb2PCpHlat0gkzw0EvtSgpKHu4mk0MnrA"},
            "key": {"default": None, "help": "wayback API key. to get credentials visit https://archive.org/account/s3.php"},
            "secret": {"default": None, "help": "wayback API secret. to get credentials visit https://archive.org/account/s3.php"}
        }
@@ -50,7 +51,11 @@ class WaybackArchiverEnricher(Enricher, Archiver):
            "Accept": "application/json",
            "Authorization": f"LOW {self.key}:{self.secret}"
        }
-        r = requests.post('https://web.archive.org/save/', headers=ia_headers, data={'url': url})
+        post_data = {'url': url}
        if self.if_not_archived_within:
            post_data["if_not_archived_within"] = self.if_not_archived_within
        # see https://docs.google.com/document/d/1Nsv52MvSjbLb2PCpHlat0gkzw0EvtSgpKHu4mk0MnrA for more options
        r = requests.post('https://web.archive.org/save/', headers=ia_headers, data=post_data)
        if r.status_code != 200:
            logger.error(em := f"Internet archive failed with status of {r.status_code}: {r.json()}")
--- a/src/auto_archiver/enrichers/whisper_enricher.py
+++ b/src/auto_archiver/enrichers/whisper_enricher.py
@@ -18,17 +18,18 @@ class WhisperEnricher(Enricher):
    def __init__(self, config: dict) -> None:
        # without this STEP.__init__ is not called
        super().__init__(config)
        assert type(self.api_endpoint) == str and len(self.api_endpoint) > 0, "please provide a value for the whisper_enricher api_endpoint"
        assert type(self.api_key) == str and len(self.api_key) > 0, "please provide a value for the whisper_enricher api_key"
        self.timeout = int(self.timeout)
    @staticmethod
    def configs() -> dict:
        return {
-            "api_endpoint": {"default": "https://whisper.spoettel.dev/api/v1", "help": "WhisperApi api endpoint"},
+            "api_endpoint": {"default": None, "help": "WhisperApi api endpoint, eg: https://whisperbox-api.com/api/v1, a deployment of https://github.com/bellingcat/whisperbox-transcribe."},
            "api_key": {"default": None, "help": "WhisperApi api key for authentication"},
            "include_srt": {"default": False, "help": "Whether to include a subtitle SRT (SubRip Subtitle file) for the video (can be used in video players)."},
            "timeout": {"default": 90, "help": "How many seconds to wait at most for a successful job completion."},
-            "action": {"default": "translation", "help": "which Whisper operation to execute", "choices": ["transcript", "translation", "language_detection"]},
+            "action": {"default": "translate", "help": "which Whisper operation to execute", "choices": ["transcribe", "translate", "language_detection"]},
        }
@@ -56,9 +57,12 @@ class WhisperEnricher(Enricher):
        for i, m in enumerate(to_enrich.media):
            if m.is_video() or m.is_audio():
-                job_id = to_enrich.media[i].get("whisper_model")["job_id"]
+                job_id = to_enrich.media[i].get("whisper_model", {}).get("job_id")
                if not job_id: continue
                to_enrich.media[i].set("whisper_model", {
                    "job_id": job_id,
                    "job_status_check": f"{self.api_endpoint}/jobs/{job_id}",
                    "job_artifacts_check": f"{self.api_endpoint}/jobs/{job_id}/artifacts",
                    **(job_results[job_id] if job_results[job_id] else {"result": "incomplete or failed job"})
                })
                # append the extracted text to the content of the post so it gets written to the DBs like gsheets text column
@@ -76,6 +80,7 @@ class WhisperEnricher(Enricher):
            "type": self.action,
            # "language": "string" # may be a config
        }
        logger.debug(f"calling API with {payload=}")
        response = requests.post(f'{self.api_endpoint}/jobs', json=payload, headers={'Authorization': f'Bearer {self.api_key}'})
        assert response.status_code == 201, f"calling the whisper api {self.api_endpoint} returned a non-success code: {response.status_code}"
        logger.debug(response.json())
--- a/src/auto_archiver/formatters/templates/macros.html
+++ b/src/auto_archiver/formatters/templates/macros.html
@@ -16,7 +16,7 @@ No URL available for {{ m.key }}.
        <a href="https://lens.google.com/uploadbyurl?url={{ url | quote }}">Google Lens</a>,&nbsp;
        <a href="https://yandex.ru/images/touch/search?rpt=imageview&url={{ url | quote }}">Yandex</a>,&nbsp;
        <a href="https://www.bing.com/images/search?view=detailv2&iss=sbi&form=SBIVSP&sbisrc=UrlPaste&q=imgurl:{{ url | quote }}">Bing</a>,&nbsp;
-        <a href="https://www.tineye.com/search/?url={{ url | quote }}">Tineye</a>,&nbsp;
+        <a href="https://www.tineye.com/search/?url={{ url | quote }}">Tineye</a>
    </div>
    <p></p>
 </div>
--- a/src/auto_archiver/storages/gd.py
+++ b/src/auto_archiver/storages/gd.py
@@ -119,7 +119,7 @@ class GDriveStorage(Storage):
            'parents': [upload_to]
        }
        media = MediaFileUpload(media.filename, resumable=True)
-        gd_file = self.service.files().create(body=file_metadata, media_body=media, fields='id').execute()
+        gd_file = self.service.files().create(supportsAllDrives=True, body=file_metadata, media_body=media, fields='id').execute()
        logger.debug(f'uploadf: uploaded file {gd_file["id"]} successfully in folder={upload_to}')
    # must be implemented even if unused
@@ -150,6 +150,9 @@ class GDriveStorage(Storage):
        for attempt in range(retries):
            results = self.service.files().list(
                # both below for Google Shared Drives
                supportsAllDrives=True,
                includeItemsFromAllDrives=True,
                q=query_string,
                spaces='drive',  # ie not appDataFolder or photos
                fields='files(id, name)'
@@ -182,7 +185,7 @@ class GDriveStorage(Storage):
            'mimeType': 'application/vnd.google-apps.folder',
            'parents': [parent_id]
        }
-        gd_folder = self.service.files().create(body=file_metadata, fields='id').execute()
+        gd_folder = self.service.files().create(supportsAllDrives=True, body=file_metadata, fields='id').execute()
        return gd_folder.get('id')
    # def exists(self, key):
--- a/src/auto_archiver/utils/url.py
+++ b/src/auto_archiver/utils/url.py
@@ -52,6 +52,19 @@ class UrlUtil:
        # telegram
        if "https://telegram.org/img/emoji/" in url: return False
        # youtube
        if "https://www.youtube.com/s/gaming/emoji/" in url: return False
        if "https://yt3.ggpht.com" in url and "default-user=" in url: return False
        if "https://www.youtube.com/s/search/audio/" in url: return False
        # ok
        if " https://ok.ru/res/i/" in url: return False
        # vk
        if "https://vk.com/emoji/" in url: return False
        if "vk.com/images/" in url: return False
        if "vk.com/images/reaction/" in url: return False
        return True
    @staticmethod
--- a/src/auto_archiver/version.py
+++ b/src/auto_archiver/version.py
@@ -3,7 +3,7 @@ _MAJOR = "0"
 _MINOR = "6"
 # On main and in a nightly release the patch should be one ahead of the last
 # released build.
-_PATCH = "2"
+_PATCH = "13"
 # This is mainly for nightly builds which have the suffix ".dev$DATE". See
 # https://semver.org/#is-v123-a-semantic-version for the semantics.
 _SUFFIX = ""
Author	SHA1	Message	Date
msramalho	1382f8b795	version bump and release without commit	2023-09-22 10:18:58 +01:00
Dave Mateer	fac8364762	Updated gd.py to work with shared folders (#102 ) Co-authored-by: msramalho <19508417+msramalho@users.noreply.github.com>	2023-09-22 10:17:54 +01:00
msramalho	0feeb0bd24	Bump version to v0.6.12 for release	2023-09-20 10:18:44 +01:00
msramalho	ddb9dc87d7	unfortunately needed twitter->x	2023-09-20 10:17:31 +01:00
msramalho	e8935b9a80	Bump version to v0.6.11 for release	2023-09-15 19:53:07 +01:00
msramalho	b157f9a6b1	renaming variable	2023-09-15 19:52:47 +01:00
msramalho	ea38a604bb	fixes #96 by not assigning to self.prop	2023-09-15 19:35:35 +01:00
msramalho	53494c961e	Bump version to v0.6.10 for release	2023-09-14 17:50:08 +01:00
Kai	f7839a99cc	Add configs for path to write and read wacz archives (#93 ) Co-authored-by: msramalho <19508417+msramalho@users.noreply.github.com>	2023-09-14 17:49:37 +01:00
msramalho	7a2119e6e9	Bump version to v0.6.9 for release	2023-09-12 20:08:00 +01:00
Miguel Sozinho Ramalho	3ae25e51e7	adds flexibile setup for wacz in docker (#94 )	2023-09-12 20:07:21 +01:00
msramalho	9584193d69	Bump version to v0.6.8 for release	2023-09-08 15:10:02 +01:00
msramalho	0dd45d90f1	fix: docker+wacz troubles	2023-09-08 15:09:50 +01:00
msramalho	edcb2da74a	Bump version to v0.6.7 for release	2023-09-06 17:07:14 +01:00
msramalho	17d9bf694f	fix docker image so as not to remove browsertrix files	2023-09-06 17:07:10 +01:00
Miguel Sozinho Ramalho	368395ffa8	Merge pull request #88 from djhmateer/v6-test	2023-08-28 11:09:28 +01:00
Miguel Sozinho Ramalho	21d7d2e16c	format youtubedl_archiver.py	2023-08-28 11:09:03 +01:00
Dave Mateer	0bbb4c9b08	Added noplaylist true to youtubedl so that videos in playlists will work	2023-08-27 17:26:36 +01:00
msramalho	a30607801f	Bump version to v0.6.6 for release	2023-08-24 17:10:16 +01:00
Miguel Sozinho Ramalho	c75d54a4ec	Merge pull request #87 from bellingcat/fix-wacz	2023-08-24 17:09:49 +01:00
msramalho	804fcb1204	browsertrix dependencies isolated into dockerfile	2023-08-24 16:57:58 +01:00
msramalho	b2adceff25	Bump version to v0.6.5 for release	2023-08-24 12:43:49 +01:00
msramalho	92a0a92b47	closes #86	2023-08-24 12:43:28 +01:00
msramalho	bf3c04b3fc	Bump version to v0.6.4 for release	2023-08-18 21:25:17 +01:00
msramalho	7eebecdb2c	update dependencies	2023-08-18 21:25:13 +01:00
msramalho	b17b5953dd	closes #59	2023-08-17 18:11:58 +01:00
msramalho	ceb717ea65	exclude vk emojis	2023-08-17 18:11:26 +01:00
msramalho	6e4fb76940	exclude ok resource images from wacz enricher	2023-08-09 11:26:46 +01:00
msramalho	810a31b1f0	fix: whisper handle error http code	2023-08-08 18:06:48 +01:00
msramalho	8b15d733b1	adds whisper endpoints	2023-08-05 14:03:57 +01:00
msramalho	ca37d54b7f	Bump version to v0.6.3 for release	2023-08-05 13:58:39 +01:00
msramalho	a1742b5565	fixing whisper enricher	2023-08-05 13:57:09 +01:00
msramalho	60a1f3a27a	minor fixes	2023-07-31 16:08:48 +01:00