version bump

refactors free twitter archiver strategies (#142 )
removes deprecated datetime method
2026-06-08 03:18:28 +03:00 · 2024-05-14 16:42:15 +01:00 · 2024-05-14 16:23:33 +01:00 · 2024-05-14 15:54:50 +01:00 · 2024-04-16 12:45:45 +01:00 · 2024-04-15 19:54:55 +01:00
31 changed files with 951 additions and 439 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -27,4 +27,5 @@ instaloader.session
 orchestration.yaml
 auto_archiver.egg-info*
 logs*
-*.csv
+*.csv
+archived/
--- a/7
+++ b/7
@@ -1,4 +1,4 @@
-FROM webrecorder/browsertrix-crawler:latest
+FROM webrecorder/browsertrix-crawler:1.0.4

 ENV RUNNING_IN_DOCKER=1

@@ -19,9 +19,8 @@ RUN pip install --upgrade pip && \

 COPY Pipfile* ./
 # install from pipenv, with browsertrix-only requirements
-RUN pipenv install && \
-	pipenv install pywb uwsgi
-	
+RUN pipenv install
+
 # doing this at the end helps during development, builds are quick
 COPY ./src/ . 

--- a/Pipfile.lock
+++ b/Pipfile.lock
--- a/README.md
+++ b/README.md
@@ -177,6 +177,38 @@ To use Google Drive storage you need the id of the shared folder in the `config.
 #### Telethon + Instagram with telegram bot
 The first time you run, you will be prompted to do a authentication with the phone number associated, alternatively you can put your `anon.session` in the root.

+#### Atlos
+When integrating with [Atlos](https://atlos.org), you will need to provide an API token in your configuration. You can learn more about Atlos and how to get an API token [here](https://docs.atlos.org/technical/api). You will have to provide this token to the `atlos_feeder`, `atlos_storage`, and `atlos_db` steps in your orchestration file. If you use a custom or self-hosted Atlos instance, you can also specify the `atlos_url` option to point to your custom instance's URL. For example:
+
+```yaml
+# orchestration.yaml content
+steps:
+  feeder: atlos_feeder
+  archivers: # order matters
+    - youtubedl_archiver
+  enrichers:
+    - thumbnail_enricher
+    - hash_enricher
+  formatter: html_formatter
+  storages:
+    - atlos_storage
+  databases:
+    - console_db
+    - atlos_db
+
+configurations:
+  atlos_feeder:
+    atlos_url: "https://platform.atlos.org" # optional
+    api_token: "...your API token..."
+  atlos_db:
+    atlos_url: "https://platform.atlos.org" # optional
+    api_token: "...your API token..."
+  atlos_storage:
+    atlos_url: "https://platform.atlos.org" # optional
+    api_token: "...your API token..."
+  hash_enricher:
+    algorithm: "SHA-256"
+```

 ## Running on Google Sheets Feeder (gsheet_feeder)
 The `--gsheet_feeder.sheet` property is the name of the Google Sheet to check for URLs. 
--- a/example.orchestration.yaml
+++ b/example.orchestration.yaml
@@ -7,6 +7,7 @@ steps:
    # - telegram_archiver
    # - twitter_archiver
    # - twitter_api_archiver
+    # - instagram_api_archiver
    # - instagram_tbot_archiver
    # - instagram_archiver
    # - tiktok_archiver
--- a/src/auto_archiver/archivers/instagram_api_archiver.py
+++ b/src/auto_archiver/archivers/instagram_api_archiver.py
@@ -186,12 +186,13 @@ class InstagramAPIArchiver(Archiver):
    def download_stories(self, result: Metadata, username: str) -> Metadata:
        now = datetime.now().strftime("%Y-%m-%d_%H-%M")
        stories = self._download_stories_reusable(result, username)
+        if stories == []: return result.success("insta no story")
        result.set_title(f"stories {username} at {now}").set("#stories", len(stories))
        return result.success(f"insta stories {now}")
    
    def _download_stories_reusable(self, result: Metadata, username: str) -> list[dict]:
        stories = self.call_api(f"v1/user/stories/by/username", {"username": username})
-        assert stories, f"Stories for {username} not found"
+        if not stories or not len(stories): return []
        stories = stories[::-1] # newest to oldest

        for s in tqdm(stories, desc="downloading stories", unit="story"):
--- a/src/auto_archiver/archivers/instagram_tbot_archiver.py
+++ b/src/auto_archiver/archivers/instagram_tbot_archiver.py
@@ -42,7 +42,7 @@ class InstagramTbotArchiver(Archiver):
        # make a copy of the session that is used exclusively with this archiver instance
        new_session_file = os.path.join("secrets/", f"instabot-{time.strftime('%Y-%m-%d')}{random_str(8)}.session")
        shutil.copy(self.session_file + ".session", new_session_file)
-        self.session_file = new_session_file
+        self.session_file = new_session_file.replace(".session", "")

        try:
            self.client = TelegramClient(self.session_file, self.api_id, self.api_hash)
@@ -54,8 +54,9 @@ class InstagramTbotArchiver(Archiver):

    def cleanup(self) -> None:
        logger.info(f"CLEANUP {self.name}.")
-        if os.path.exists(self.session_file):
-            os.remove(self.session_file)
+        session_file_name = self.session_file + ".session"
+        if os.path.exists(session_file_name):
+            os.remove(session_file_name)
        
    def download(self, item: Metadata) -> Metadata:
        url = item.get_url()
--- a/src/auto_archiver/archivers/telethon_archiver.py
+++ b/src/auto_archiver/archivers/telethon_archiver.py
@@ -49,7 +49,7 @@ class TelethonArchiver(Archiver):
        # make a copy of the session that is used exclusively with this archiver instance
        new_session_file = os.path.join("secrets/", f"telethon-{time.strftime('%Y-%m-%d')}{random_str(8)}.session")
        shutil.copy(self.session_file + ".session", new_session_file)
-        self.session_file = new_session_file
+        self.session_file = new_session_file.replace(".session", "")

        # initiate the client
        self.client = TelegramClient(self.session_file, self.api_id, self.api_hash)
@@ -101,8 +101,9 @@ class TelethonArchiver(Archiver):

    def cleanup(self) -> None:
        logger.info(f"CLEANUP {self.name}.")
-        if os.path.exists(self.session_file):
-            os.remove(self.session_file)
+        session_file_name = self.session_file + ".session"
+        if os.path.exists(session_file_name):
+            os.remove(session_file_name)

    def download(self, item: Metadata) -> Metadata:
        """
--- a/src/auto_archiver/archivers/twitter_archiver.py
+++ b/src/auto_archiver/archivers/twitter_archiver.py
@@ -1,7 +1,10 @@
 import re, requests, mimetypes, json
+from typing import Union
 from datetime import datetime
 from loguru import logger
 from snscrape.modules.twitter import TwitterTweetScraper, Video, Gif, Photo
+from yt_dlp import YoutubeDL
+from yt_dlp.extractor.twitter import TwitterIE
 from slugify import slugify

 from . import Archiver
@@ -29,7 +32,7 @@ class TwitterArchiver(Archiver):
        # expand URL if t.co and clean tracker GET params
        if 'https://t.co/' in url:
            try:
-                r = requests.get(url)
+                r = requests.get(url, timeout=30)
                logger.debug(f'Expanded url {url} to {r.url}')
                url = r.url
            except:
@@ -43,19 +46,31 @@ class TwitterArchiver(Archiver):
        can handle private/public channels
        """
        url = item.get_url()
-        # detect URLs that we definitely cannot handle
        username, tweet_id = self.get_username_tweet_id(url)
        if not username: return False

-        result = Metadata()
+        strategies = [self.download_yt_dlp, self.download_snscrape, self.download_syndication]
+        for strategy in strategies:
+            logger.debug(f"Trying {strategy.__name__} for {url=}")
+            try:
+                result = strategy(item, url, tweet_id)
+                if result: return result
+            except Exception as ex:
+                logger.error(f"Failed to download {url} with {strategy.__name__}: {type(ex).__name__} occurred. args: {ex.args}")
+        
+        logger.warning(f"No free strategy worked for {url}")
+        return False

+        
+    def download_snscrape(self, item: Metadata, url: str, tweet_id: str) -> Union[Metadata|bool]:
        scr = TwitterTweetScraper(tweet_id)
        try:
            tweet = next(scr.get_items())
        except Exception as ex:
-            logger.warning(f"can't get tweet: {type(ex).__name__} occurred. args: {ex.args}")
-            return self.download_alternative(item, url, tweet_id)
-
+            logger.warning(f"SNSCRAPE FAILED, can't get tweet: {type(ex).__name__} occurred. args: {ex.args}")
+            return False
+        
+        result = Metadata()
        result.set_title(tweet.content).set_content(tweet.json()).set_timestamp(tweet.date)
        if tweet.media is None:
            logger.debug(f'No media found, archiving tweet text only')
@@ -85,7 +100,7 @@ class TwitterArchiver(Archiver):

        return result.success("twitter-snscrape")

-    def download_alternative(self, item: Metadata, url: str, tweet_id: str) -> Metadata:
+    def download_syndication(self, item: Metadata, url: str, tweet_id: str) -> Union[Metadata|bool]:
        """
        Hack alternative working again.
        https://stackoverflow.com/a/71867055/6196010 (OUTDATED URL)
@@ -93,12 +108,13 @@ class TwitterArchiver(Archiver):
        next to test: https://cdn.embedly.com/widgets/media.html?&schema=twitter&url=https://twitter.com/bellingcat/status/1674700676612386816
        """

-        logger.debug(f"Trying twitter hack for {url=}")
-        result = Metadata()
-
        hack_url = f"https://cdn.syndication.twimg.com/tweet-result?id={tweet_id}"
        r = requests.get(hack_url)
-        if r.status_code != 200: return False
+        if r.status_code != 200 or r.json()=={}: 
+            logger.warning(f"SyndicationHack: Failed to get tweet information from {hack_url}.")
+            return False
+        
+        result = Metadata()
        tweet = r.json()

        urls = []
@@ -108,7 +124,7 @@ class TwitterArchiver(Archiver):
        # 1 tweet has 1 video max
        if "video" in tweet:
            v = tweet["video"]
-            urls.append(self.choose_variant(v.get("variants", [])))
+            urls.append(self.choose_variant(v.get("variants", []))['url'])

        logger.debug(f"Twitter hack got {urls=}")

@@ -124,7 +140,39 @@ class TwitterArchiver(Archiver):
            result.add_media(media)

        result.set_title(tweet.get("text")).set_content(json.dumps(tweet, ensure_ascii=False)).set_timestamp(datetime.strptime(tweet["created_at"], "%Y-%m-%dT%H:%M:%S.%fZ"))
-        return result.success("twitter-hack")
+        return result.success("twitter-syndication")
+
+    def download_yt_dlp(self, item: Metadata, url: str, tweet_id: str) -> Union[Metadata|bool]:
+        downloader = YoutubeDL()
+        tie = TwitterIE(downloader)
+        tweet = tie._extract_status(tweet_id)
+        result = Metadata()
+        result\
+            .set_title(tweet.get('full_text', ''))\
+            .set_content(json.dumps(tweet, ensure_ascii=False))\
+            .set_timestamp(datetime.strptime(tweet["created_at"], "%a %b %d %H:%M:%S %z %Y"))
+        if not tweet.get("entities", {}).get("media"):
+            logger.debug('No media found, archiving tweet text only')
+            result.status = "twitter-ytdl"
+            return result
+        for i, tw_media in enumerate(tweet["entities"]["media"]):
+            media = Media(filename="")
+            mimetype = ""
+            if tw_media["type"] == "photo":
+                media.set("src", UrlUtil.twitter_best_quality_url(tw_media['media_url_https']))
+                mimetype = "image/jpeg"
+            elif tw_media["type"] == "video":
+                variant = self.choose_variant(tw_media['video_info']['variants'])
+                media.set("src", variant['url'])
+                mimetype = variant['content_type']
+            elif tw_media["type"] == "animated_gif":
+                variant = tw_media['video_info']['variants'][0]
+                media.set("src", variant['url'])
+                mimetype = variant['content_type']
+            ext = mimetypes.guess_extension(mimetype)
+            media.filename = self.download_from_url(media.get("src"), f'{slugify(url)}_{i}{ext}', item)
+            result.add_media(media)
+        return result.success("twitter-ytdl")

    def get_username_tweet_id(self, url):
        # detect URLs that we definitely cannot handle
@@ -140,13 +188,13 @@ class TwitterArchiver(Archiver):
        # choosing the highest quality possible
        variant, width, height = None, 0, 0
        for var in variants:
-            if var.get("type", "") == "video/mp4":
-                width_height = re.search(r"\/(\d+)x(\d+)\/", var["src"])
+            if var.get("content_type", "") == "video/mp4":
+                width_height = re.search(r"\/(\d+)x(\d+)\/", var["url"])
                if width_height:
                    w, h = int(width_height[1]), int(width_height[2])
                    if w > width or h > height:
                        width, height = w, h
-                        variant = var.get("src", variant)
+                        variant = var
            else:
-                variant = var.get("src") if not variant else variant
+                variant = var if not variant else variant
        return variant
--- a/src/auto_archiver/archivers/youtubedl_archiver.py
+++ b/src/auto_archiver/archivers/youtubedl_archiver.py
@@ -56,7 +56,8 @@ class YoutubeDLArchiver(Archiver):
            return False

        # this time download
-        ydl = yt_dlp.YoutubeDL({**ydl_options, "getcomments": self.comments}) 
+        ydl = yt_dlp.YoutubeDL({**ydl_options, "getcomments": self.comments})
+        #TODO: for playlist or long lists of videos, how to download one at a time so they can be stored before the next one is downloaded?
        info = ydl.extract_info(url, download=True)

        if "entries" in info:
@@ -97,11 +98,12 @@ class YoutubeDLArchiver(Archiver):
            result.set("comments", [{
                "text": c["text"],
                "author": c["author"], 
-                "timestamp": datetime.datetime.utcfromtimestamp(c.get("timestamp")).replace(tzinfo=datetime.timezone.utc)
+                "timestamp": datetime.datetime.fromtimestamp(c.get("timestamp"), tz = datetime.timezone.utc)
            } for c in info.get("comments", [])])

        if (timestamp := info.get("timestamp")):
-            timestamp = datetime.datetime.utcfromtimestamp(timestamp).replace(tzinfo=datetime.timezone.utc).isoformat()
+            #TODO: fix deprecated timestamp, 
+            timestamp = datetime.datetime.fromtimestamp(timestamp, tz = datetime.timezone.utc).isoformat()
            result.set_timestamp(timestamp)
        if (upload_date := info.get("upload_date")):
            upload_date = datetime.datetime.strptime(upload_date, '%Y%m%d').replace(tzinfo=datetime.timezone.utc)
--- a/src/auto_archiver/core/media.py
+++ b/src/auto_archiver/core/media.py
@@ -25,10 +25,11 @@ class Media:
    _mimetype: str = None  # eg: image/jpeg
    _stored: bool = field(default=False, repr=False, metadata=config(exclude=lambda _: True))  # always exclude

-    def store(self: Media, override_storages: List = None, url: str = "url-not-available"):
-        # stores the media into the provided/available storages [Storage]
-        # repeats the process for its properties, in case they have inner media themselves
-        # for now it only goes down 1 level but it's easy to make it recursive if needed
+    def store(self: Media, override_storages: List = None, url: str = "url-not-available", metadata: Any = None):
+        # 'Any' typing for metadata to avoid circular imports. Stores the media
+        # into the provided/available storages [Storage], then repeats the
+        # process for its properties in case they have inner media themselves.
+        # For now it only goes down 1 level but it's easy to make it recursive if needed.
        storages = override_storages or ArchivingContext.get("storages")
        if not len(storages):
            logger.warning(f"No storages found in local context or provided directly for {self.filename}.")
@@ -36,7 +37,7 @@ class Media:

        for s in storages:
            for any_media in self.all_inner_media(include_self=True):
-                s.store(any_media, url)
+                s.store(any_media, url, metadata=metadata)

    def all_inner_media(self, include_self=False):
        """ Media can be inside media properties, examples include transformations on original media.
--- a/src/auto_archiver/core/metadata.py
+++ b/src/auto_archiver/core/metadata.py
@@ -48,7 +48,7 @@ class Metadata:
        self.remove_duplicate_media_by_hash()
        storages = override_storages or ArchivingContext.get("storages")
        for media in self.media:
-            media.store(override_storages=storages, url=self.get_url())
+            media.store(override_storages=storages, url=self.get_url(), metadata=self)

    def set(self, key: str, val: Any) -> Metadata:
        self.metadata[key] = val
--- a/src/auto_archiver/core/orchestrator.py
+++ b/src/auto_archiver/core/orchestrator.py
@@ -1,5 +1,7 @@
 from __future__ import annotations
 from typing import Generator, Union, List
+from urllib.parse import urlparse
+from ipaddress import ip_address

 from .context import ArchivingContext

@@ -26,7 +28,7 @@ class ArchivingOrchestrator:
        ArchivingContext.set("storages", self.storages, keep_on_reset=True)

        try: 
-            for a in self.archivers: a.setup()
+            for a in self.all_archivers_for_setup(): a.setup()
        except (KeyboardInterrupt, Exception) as e:
            logger.error(f"Error during setup of archivers: {e}\n{traceback.format_exc()}")
            self.cleanup()
@@ -34,7 +36,7 @@ class ArchivingOrchestrator:

    def cleanup(self)->None:
        logger.info("Cleaning up")
-        for a in self.archivers: a.cleanup()
+        for a in self.all_archivers_for_setup(): a.cleanup()

    def feed(self) -> Generator[Metadata]:
        for item in self.feeder:
@@ -60,7 +62,9 @@ class ArchivingOrchestrator:
            exit()
        except Exception as e:
            logger.error(f'Got unexpected error on item {item}: {e}\n{traceback.format_exc()}')
-            for d in self.databases: d.failed(item)
+            for d in self.databases:
+                if type(e) == AssertionError: d.failed(item, str(e))
+                else: d.failed(item)


    def archive(self, result: Metadata) -> Union[Metadata, None]:
@@ -73,7 +77,8 @@ class ArchivingOrchestrator:
            5. Store all downloaded/generated media
            6. Call selected Formatter and store formatted if needed
        """
-        original_url = result.get_url()
+        original_url = result.get_url().strip()
+        self.assert_valid_url(original_url)

        # 1 - sanitize - each archiver is responsible for cleaning/expanding its own URLs
        url = original_url
@@ -115,7 +120,7 @@ class ArchivingOrchestrator:

        # 6 - format and store formatted if needed
        if (final_media := self.formatter.format(result)):
-            final_media.store(url=url)
+            final_media.store(url=url, metadata=result)
            result.set_final_media(final_media)

        if result.is_empty():
@@ -128,3 +133,26 @@ class ArchivingOrchestrator:
                logger.error(f"ERROR database {d.name}: {e}: {traceback.format_exc()}")

        return result
+
+    def assert_valid_url(self, url: str) -> bool:
+        """
+        Blocks localhost, private, reserved, and link-local IPs and all non-http/https schemes.
+        """
+        assert url.startswith("http://") or url.startswith("https://"), f"Invalid URL scheme"
+        
+        parsed = urlparse(url)
+        assert parsed.scheme in ["http", "https"], f"Invalid URL scheme"
+        assert parsed.hostname, f"Invalid URL hostname"
+        assert parsed.hostname != "localhost", f"Invalid URL"
+
+        try: # special rules for IP addresses
+            ip = ip_address(parsed.hostname)
+        except ValueError: pass
+        else:
+            assert ip.is_global, f"Invalid IP used"
+            assert not ip.is_reserved, f"Invalid IP used"
+            assert not ip.is_link_local, f"Invalid IP used"
+            assert not ip.is_private, f"Invalid IP used"
+
+    def all_archivers_for_setup(self) -> List[Archiver]:
+        return self.archivers + [e for e in self.enrichers if isinstance(e, Archiver)]
--- a/src/auto_archiver/databases/init.py
+++ b/src/auto_archiver/databases/init.py
@@ -2,4 +2,5 @@ from .database import Database
 from .gsheet_db import GsheetsDb
 from .console_db import ConsoleDb
 from .csv_db import CSVDb
-from .api_db import AAApiDb
+from .api_db import AAApiDb
+from .atlos_db import AtlosDb
--- a/src/auto_archiver/databases/atlos_db.py
+++ b/src/auto_archiver/databases/atlos_db.py
@@ -0,0 +1,79 @@
+import os
+from typing import Union
+from loguru import logger
+from csv import DictWriter
+from dataclasses import asdict
+import requests
+
+from . import Database
+from ..core import Metadata
+from ..utils import get_atlos_config_options
+
+
+class AtlosDb(Database):
+    """
+    Outputs results to Atlos
+    """
+
+    name = "atlos_db"
+
+    def __init__(self, config: dict) -> None:
+        # without this STEP.__init__ is not called
+        super().__init__(config)
+
+    @staticmethod
+    def configs() -> dict:
+        return get_atlos_config_options()
+
+    def failed(self, item: Metadata, reason: str) -> None:
+        """Update DB accordingly for failure"""
+        # If the item has no Atlos ID, there's nothing for us to do
+        if not item.metadata.get("atlos_id"):
+            logger.info(f"Item {item.get_url()} has no Atlos ID, skipping")
+            return
+
+        requests.post(
+            f"{self.atlos_url}/api/v2/source_material/metadata/{item.metadata['atlos_id']}/auto_archiver",
+            headers={"Authorization": f"Bearer {self.api_token}"},
+            json={"metadata": {"processed": True, "status": "error", "error": reason}},
+        ).raise_for_status()
+        logger.info(
+            f"Stored failure for {item.get_url()} (ID {item.metadata['atlos_id']}) on Atlos: {reason}"
+        )
+
+    def fetch(self, item: Metadata) -> Union[Metadata, bool]:
+        """Check and fetch if the given item has been archived already. Each
+        database should handle its own caching and configuration mechanisms."""
+        return False
+
+    def _process_metadata(self, item: Metadata) -> dict:
+        """Process metadata for storage on Atlos. Will convert any datetime
+        objects to ISO format."""
+
+        return {
+            k: v.isoformat() if hasattr(v, "isoformat") else v
+            for k, v in item.metadata.items()
+        }
+
+    def done(self, item: Metadata, cached: bool = False) -> None:
+        """archival result ready - should be saved to DB"""
+
+        if not item.metadata.get("atlos_id"):
+            logger.info(f"Item {item.get_url()} has no Atlos ID, skipping")
+            return
+
+        requests.post(
+            f"{self.atlos_url}/api/v2/source_material/metadata/{item.metadata['atlos_id']}/auto_archiver",
+            headers={"Authorization": f"Bearer {self.api_token}"},
+            json={
+                "metadata": dict(
+                    processed=True,
+                    status="success",
+                    results=self._process_metadata(item),
+                )
+            },
+        ).raise_for_status()
+
+        logger.info(
+            f"Stored success for {item.get_url()} (ID {item.metadata['atlos_id']}) on Atlos"
+        )
--- a/src/auto_archiver/databases/console_db.py
+++ b/src/auto_archiver/databases/console_db.py
@@ -21,8 +21,8 @@ class ConsoleDb(Database):
    def started(self, item: Metadata) -> None:
        logger.warning(f"STARTED {item}")

-    def failed(self, item: Metadata) -> None:
-        logger.error(f"FAILED {item}")
+    def failed(self, item: Metadata, reason:str) -> None:
+        logger.error(f"FAILED {item}: {reason}")

    def aborted(self, item: Metadata) -> None:
        logger.warning(f"ABORTED {item}")
--- a/src/auto_archiver/databases/database.py
+++ b/src/auto_archiver/databases/database.py
@@ -22,7 +22,7 @@ class Database(Step, ABC):
        """signals the DB that the given item archival has started"""
        pass

-    def failed(self, item: Metadata) -> None:
+    def failed(self, item: Metadata, reason:str) -> None:
        """update DB accordingly for failure"""
        pass

--- a/src/auto_archiver/databases/gsheet_db.py
+++ b/src/auto_archiver/databases/gsheet_db.py
@@ -29,9 +29,9 @@ class GsheetsDb(Database):
        gw, row = self._retrieve_gsheet(item)
        gw.set_cell(row, 'status', 'Archive in progress')

-    def failed(self, item: Metadata) -> None:
+    def failed(self, item: Metadata, reason:str) -> None:
        logger.error(f"FAILED {item}")
-        self._safe_status_update(item, 'Archive failed')
+        self._safe_status_update(item, f'Archive failed {reason}')

    def aborted(self, item: Metadata) -> None:
        logger.warning(f"ABORTED {item}")
@@ -102,6 +102,11 @@ class GsheetsDb(Database):

    def _retrieve_gsheet(self, item: Metadata) -> Tuple[GWorksheet, int]:
        # TODO: to make gsheet_db less coupled with gsheet_feeder's "gsheet" parameter, this method could 1st try to fetch "gsheet" from ArchivingContext and, if missing, manage its own singleton - not needed for now
-        gw: GWorksheet = ArchivingContext.get("gsheet").get("worksheet")
-        row: int = ArchivingContext.get("gsheet").get("row")
+        if gsheet := ArchivingContext.get("gsheet"):
+            gw: GWorksheet = gsheet.get("worksheet")
+            row: int = gsheet.get("row")
+        elif self.sheet_id:
+            print(self.sheet_id)
+
+
        return gw, row
--- a/src/auto_archiver/enrichers/ssl_enricher.py
+++ b/src/auto_archiver/enrichers/ssl_enricher.py
@@ -27,7 +27,10 @@ class SSLEnricher(Enricher):
        if not to_enrich.media and self.skip_when_nothing_archived: return
        
        url = to_enrich.get_url()
-        domain = urlparse(url).netloc
+        parsed = urlparse(url)
+        assert parsed.scheme in ["https"], f"Invalid URL scheme {url=}"
+        
+        domain = parsed.netloc
        logger.debug(f"fetching SSL certificate for {domain=} in {url=}")

        cert = ssl.get_server_certificate((domain, 443))
--- a/src/auto_archiver/enrichers/wacz_enricher.py
+++ b/src/auto_archiver/enrichers/wacz_enricher.py
@@ -75,14 +75,16 @@ class WaczArchiverEnricher(Enricher, Archiver):
            "--url", url,
            "--scopeType", "page",
            "--generateWACZ",
-            "--text",
+            "--text", "to-pages",
            "--screenshot", "fullPage",
            "--collection", collection,
            "--id", collection,
            "--saveState", "never",
            "--behaviors", "autoscroll,autoplay,autofetch,siteSpecific",
            "--behaviorTimeout", str(self.timeout),
-            "--timeout", str(self.timeout)]
+            "--timeout", str(self.timeout),
+            "--blockAds" # TODO: test
+        ]
        
        if self.docker_in_docker:
            cmd.extend(["--cwd", self.cwd_dind])
@@ -110,9 +112,9 @@ class WaczArchiverEnricher(Enricher, Archiver):

        try:
            logger.info(f"Running browsertrix-crawler: {' '.join(cmd)}")
+            my_env = os.environ.copy()
            if self.socks_proxy_host and self.socks_proxy_port:
                logger.debug("Using SOCKS proxy for browsertrix-crawler")
-                my_env = os.environ.copy()
                my_env["SOCKS_HOST"] = self.socks_proxy_host
                my_env["SOCKS_PORT"] = str(self.socks_proxy_port)
            subprocess.run(cmd, check=True, env=my_env)
@@ -161,7 +163,7 @@ class WaczArchiverEnricher(Enricher, Archiver):
        """
        Receives a .wacz archive, and extracts all relevant media from it, adding them to to_enrich.
        """
-        logger.info(f"WACZ extract_media flag is set, extracting media from {wacz_filename=}")
+        logger.info(f"WACZ extract_media or extract_screenshot flag is set, extracting media from {wacz_filename=}")

        # unzipping the .wacz
        tmp_dir = ArchivingContext.get_tmp_dir()
@@ -182,10 +184,11 @@ class WaczArchiverEnricher(Enricher, Archiver):
        # get media out of .warc
        counter = 0
        seen_urls = set()
+        import json
        with open(warc_filename, 'rb') as warc_stream:
            for record in ArchiveIterator(warc_stream):
                # only include fetched resources
-                if record.rec_type == "resource" and self.extract_screenshot:  # screenshots
+                if record.rec_type == "resource" and record.content_type == "image/png" and self.extract_screenshot:  # screenshots
                    fn = os.path.join(tmp_dir, f"warc-file-{counter}.png")
                    with open(fn, "wb") as outf: outf.write(record.raw_stream.read())
                    m = Media(filename=fn)
@@ -231,4 +234,4 @@ class WaczArchiverEnricher(Enricher, Archiver):
                to_enrich.add_media(m, warc_fn)
                counter += 1
                seen_urls.add(record_url)
-        logger.info(f"WACZ extract_media finished, found {counter} relevant media file(s)")
+        logger.info(f"WACZ extract_media/extract_screenshot finished, found {counter} relevant media file(s)")
--- a/src/auto_archiver/enrichers/whisper_enricher.py
+++ b/src/auto_archiver/enrichers/whisper_enricher.py
@@ -44,7 +44,7 @@ class WhisperEnricher(Enricher):
        job_results = {}
        for i, m in enumerate(to_enrich.media):
            if m.is_video() or m.is_audio():
-                m.store(url=url)
+                m.store(url=url, metadata=to_enrich)
                try:
                    job_id = self.submit_job(m)
                    job_results[job_id] = False
--- a/src/auto_archiver/feeders/init.py
+++ b/src/auto_archiver/feeders/init.py
@@ -1,3 +1,4 @@
 from.feeder import Feeder
 from .gsheet_feeder import GsheetsFeeder
-from .cli_feeder import CLIFeeder
+from .cli_feeder import CLIFeeder
+from .atlos_feeder import AtlosFeeder
--- a/src/auto_archiver/feeders/atlos_feeder.py
+++ b/src/auto_archiver/feeders/atlos_feeder.py
@@ -0,0 +1,56 @@
+from loguru import logger
+import requests
+
+from . import Feeder
+from ..core import Metadata, ArchivingContext
+from ..utils import get_atlos_config_options
+
+
+class AtlosFeeder(Feeder):
+    name = "atlos_feeder"
+
+    def __init__(self, config: dict) -> None:
+        # without this STEP.__init__ is not called
+        super().__init__(config)
+        if type(self.api_token) != str:
+            raise Exception("Atlos Feeder did not receive an Atlos API token")
+
+    @staticmethod
+    def configs() -> dict:
+        return get_atlos_config_options()
+
+    def __iter__(self) -> Metadata:
+        # Get all the urls from the Atlos API
+        count = 0
+        cursor = None
+        while True:
+            response = requests.get(
+                f"{self.atlos_url}/api/v2/source_material",
+                headers={"Authorization": f"Bearer {self.api_token}"},
+                params={"cursor": cursor},
+            )
+            data = response.json()
+            response.raise_for_status()
+            cursor = data["next"]
+
+            for item in data["results"]:
+                if (
+                    item["source_url"] not in [None, ""]
+                    and (
+                        item["metadata"]
+                        .get("auto_archiver", {})
+                        .get("processed", False)
+                        != True
+                    )
+                    and item["visibility"] == "visible"
+                    and item["status"] not in ["processing", "pending"]
+                ):
+                    yield Metadata().set_url(item["source_url"]).set(
+                        "atlos_id", item["id"]
+                    )
+                    count += 1
+
+            if len(data["results"]) == 0 or cursor is None:
+                break
+
+        logger.success(f"Processed {count} URL(s)")
--- a/src/auto_archiver/formatters/html_formatter.py
+++ b/src/auto_archiver/formatters/html_formatter.py
@@ -21,7 +21,7 @@ class HtmlFormatter(Formatter):
    def __init__(self, config: dict) -> None:
        # without this STEP.__init__ is not called
        super().__init__(config)
-        self.environment = Environment(loader=FileSystemLoader(os.path.join(pathlib.Path(__file__).parent.resolve(), "templates/")))
+        self.environment = Environment(loader=FileSystemLoader(os.path.join(pathlib.Path(__file__).parent.resolve(), "templates/")), autoescape=True)
        # JinjaHelper class static methods are added as filters
        self.environment.filters.update({
            k: v.__func__ for k, v in JinjaHelpers.__dict__.items() if isinstance(v, staticmethod)
--- a/src/auto_archiver/storages/__init__.py
+++ b/src/auto_archiver/storages/__init__.py
@@ -1,4 +1,5 @@
 from .storage import Storage
 from .s3 import S3Storage
 from .local import LocalStorage
-from .gd import GDriveStorage
+from .gd import GDriveStorage
+from .atlos import AtlosStorage
--- a/src/auto_archiver/storages/atlos.py
+++ b/src/auto_archiver/storages/atlos.py
@@ -0,0 +1,74 @@
+import os
+from typing import IO, List, Optional
+from loguru import logger
+import requests
+import hashlib
+
+from ..core import Media, Metadata
+from ..storages import Storage
+from ..utils import get_atlos_config_options
+
+
+class AtlosStorage(Storage):
+    name = "atlos_storage"
+
+    def __init__(self, config: dict) -> None:
+        super().__init__(config)
+
+    @staticmethod
+    def configs() -> dict:
+        return dict(Storage.configs(), **get_atlos_config_options())
+
+    def get_cdn_url(self, _media: Media) -> str:
+        # It's not always possible to provide an exact URL, because it's
+        # possible that the media once uploaded could have been copied to
+        # another project.
+        return self.atlos_url
+    
+    def _hash(self, media: Media) -> str:
+        # Hash the media file using sha-256. We don't use the existing auto archiver
+        # hash because there's no guarantee that the configuerer is using sha-256, which
+        # is how Atlos hashes files.
+
+        sha256 = hashlib.sha256()
+        with open(media.filename, "rb") as f:
+            while True:
+                buf = f.read(4096)
+                if not buf: break
+                sha256.update(buf)
+        return sha256.hexdigest()
+
+    def upload(self, media: Media, metadata: Optional[Metadata]=None, **_kwargs) -> bool:
+        atlos_id = metadata.get("atlos_id")
+        if atlos_id is None:
+            logger.error(f"No Atlos ID found in metadata; can't store {media.filename} on Atlos")
+            return False
+        
+        media_hash = self._hash(media)
+        
+        # Check whether the media has already been uploaded
+        source_material = requests.get(
+            f"{self.atlos_url}/api/v2/source_material/{atlos_id}",
+            headers={"Authorization": f"Bearer {self.api_token}"},
+        ).json()["result"]
+        existing_media = [x["file_hash_sha256"] for x in source_material.get("artifacts", [])]
+        if media_hash in existing_media:
+            logger.info(f"{media.filename} with SHA256 {media_hash} already uploaded to Atlos")
+            return True
+        
+        # Upload the media to the Atlos API
+        requests.post(
+            f"{self.atlos_url}/api/v2/source_material/upload/{atlos_id}",
+            headers={"Authorization": f"Bearer {self.api_token}"},
+            params={
+                "title": media.properties
+            },
+            files={"file": (os.path.basename(media.filename), open(media.filename, "rb"))},
+        ).raise_for_status()
+
+        logger.info(f"Uploaded {media.filename} to Atlos with ID {atlos_id} and title {media.key}")
+        
+        return True
+
+    # must be implemented even if unused
+    def uploadf(self, file: IO[bytes], key: str, **kwargs: dict) -> bool: pass
--- a/src/auto_archiver/storages/storage.py
+++ b/src/auto_archiver/storages/storage.py
@@ -1,12 +1,12 @@
 from __future__ import annotations
 from abc import abstractmethod
 from dataclasses import dataclass
-from typing import IO
+from typing import IO, Optional
 import os

 from ..utils.misc import random_str

-from ..core import Media, Step, ArchivingContext
+from ..core import Media, Step, ArchivingContext, Metadata
 from ..enrichers import HashEnricher
 from loguru import logger
 from slugify import slugify
@@ -43,12 +43,12 @@ class Storage(Step):
        # only for typing...
        return Step.init(name, config, Storage)

-    def store(self, media: Media, url: str) -> None:
+    def store(self, media: Media, url: str, metadata: Optional[Metadata]=None) -> None:
        if media.is_stored(): 
            logger.debug(f"{media.key} already stored, skipping")
            return
        self.set_key(media, url)
-        self.upload(media)
+        self.upload(media, metadata=metadata)
        media.add_url(self.get_cdn_url(media))

    @abstractmethod
--- a/src/auto_archiver/utils/__init__.py
+++ b/src/auto_archiver/utils/__init__.py
@@ -3,4 +3,5 @@ from .gworksheet import GWorksheet
 from .misc import *
 from .webdriver import Webdriver
 from .gsheet import Gsheets
-from .url import UrlUtil
+from .url import UrlUtil
+from .atlos import get_atlos_config_options
--- a/src/auto_archiver/utils/atlos.py
+++ b/src/auto_archiver/utils/atlos.py
@@ -0,0 +1,13 @@
+def get_atlos_config_options():
+    return {
+        "api_token": {
+            "default": None,
+            "help": "An Atlos API token. For more information, see https://docs.atlos.org/technical/api/",
+            "cli_set": lambda cli_val, _: cli_val
+        },
+        "atlos_url": {
+            "default": "https://platform.atlos.org",
+            "help": "The URL of your Atlos instance (e.g., https://platform.atlos.org), without a trailing slash.",
+            "cli_set": lambda cli_val, _: cli_val
+        },
+    }
--- a/src/auto_archiver/version.py
+++ b/src/auto_archiver/version.py
@@ -1,9 +1,9 @@

 _MAJOR = "0"
-_MINOR = "9"
+_MINOR = "11"
 # On main and in a nightly release the patch should be one ahead of the last
 # released build.
-_PATCH = "11"
+_PATCH = "5"
 # This is mainly for nightly builds which have the suffix ".dev$DATE". See
 # https://semver.org/#is-v123-a-semantic-version for the semantics.
 _SUFFIX = ""
--- a/todo.md
+++ b/todo.md
@@ -0,0 +1,72 @@
+------ AA + API
+
+
+
+2024-03-05 11:57:12.910 | ERROR    | auto_archiver.core.orchestrator:archive:116 - ERROR enricher wacz_archiver_enricher: 'WaczArchiverEnricher' object has no attribute 'browsertrix_home_host': Traceback (most recent call last):
+  File "/root/.local/share/virtualenvs/app-4PlAip0Q/lib/python3.10/site-packages/auto_archiver/core/orchestrator.py", line 114, in archive
+    try: e.enrich(result)
+  File "/root/.local/share/virtualenvs/app-4PlAip0Q/lib/python3.10/site-packages/auto_archiver/enrichers/wacz_enricher.py", line 70, in enrich
+    browsertrix_home_host = self.browsertrix_home_host or os.path.abspath(ArchivingContext.get_tmp_dir())
+AttributeError: 'WaczArchiverEnricher' object has no attribute 'browsertrix_home_host'
+
+
+
+-------- API
+
+
+2024-02-29 17:12:06.078 | WARNING  | worker:task_failure_notifier:100 - 😅 From task_failure_notifier ==> Task failed successfully! 
+2024-02-29 17:12:06.078 | ERROR    | worker:task_failure_notifier:101 - list index out of range
+2024-02-29 17:12:06.078 | ERROR    | worker:task_failure_notifier:102 - <traceback object at 0x7f3db75446c0>
+2024-02-29 17:12:06.079 | ERROR    | worker:task_failure_notifier:103 -   File "/root/.local/share/virtualenvs/app-4PlAip0Q/lib/python3.10/site-packages/celery/app/trace.py", line 412, in trace_task
+    R = retval = fun(*args, **kwargs)
+
+  File "/root/.local/share/virtualenvs/app-4PlAip0Q/lib/python3.10/site-packages/celery/app/trace.py", line 704, in __protected_call__
+    return self.run(*args, **kwargs)
+
+  File "/root/.local/share/virtualenvs/app-4PlAip0Q/lib/python3.10/site-packages/celery/app/autoretry.py", line 50, in run
+    raise task.retry(exc=exc, **retry_kwargs)
+
+  File "/root/.local/share/virtualenvs/app-4PlAip0Q/lib/python3.10/site-packages/celery/app/task.py", line 706, in retry
+    raise_with_context(exc)
+
+  File "/root/.local/share/virtualenvs/app-4PlAip0Q/lib/python3.10/site-packages/celery/app/autoretry.py", line 35, in run
+    return task._orig_run(*args, **kwargs)
+
+  File "/app/worker.py", line 35, in create_archive_task
+    invalid = is_group_invalid_for_user(archive.public, archive.group_id, archive.author_id)
+
+  File "/app/worker.py", line 160, in is_group_invalid_for_user
+    if not crud.is_user_in_group(session, group_id, author_id):
+
+  File "/app/db/crud.py", line 93, in is_user_in_group
+    return len(group_name) and len(email) and group_name in get_user_groups(db, email)
+
+  File "/app/db/crud.py", line 103, in get_user_groups
+    domain_level_groups = DOMAIN_GROUPS.get(email.split('@')[1], [])
+
+------------------ API
+
+[parameters: (('https://bellingcat-archive.nyc3.cdn.digitaloceanspaces.com/no-dups/b0c88017bb047ff43fc49907/3811d9d0c74541929f4a72d0.jpg', '314b2eb3-98a4-4a4b-be01-7f4d3ea1c4c2', 'profile_picture'), ('https://bellingcat-archive.nyc3.cdn.digitaloceanspaces.com/no-dups/ff30ece740738d060229c5da/e43172422e274c2a8f9529ff.jpg', '314b2eb3-98a4-4a4b-be01-7f4d3ea1c4c2', 'post 3308982791113602520'), ('https://bellingcat-archive.nyc3.cdn.digitaloceanspaces.com/no-dups/23f218c518e2d5a17fe856bd/bad85f53a8e54c26991cdff9.jpg', '314b2eb3-98a4-4a4b-be01-7f4d3ea1c4c2', 'image 3308982790987757405'), ('https://bellingcat-archive.nyc3.cdn.digitaloceanspaces.com/no-dups/2ffc8c65d6bfec7ef5402bda/520a10e7a7e14028be1cc1c8.jpg', '314b2eb3-98a4-4a4b-be01-7f4d3ea1c4c2', 'image 3308982790970975889'), ('https://bellingcat-archive.nyc3.cdn.digitaloceanspaces.com/no-dups/da0486669cbb102e6221d94c/65b151ee59114ea5b61cfe96.jpg', '314b2eb3-98a4-4a4b-be01-7f4d3ea1c4c2', 'image 3308982790979533474'), ('https://bellingcat-archive.nyc3.cdn.digitaloceanspaces.com/no-dups/d8c4db3247780324b6fa6d4a/df195ff24b104182bf255610.jpg', '314b2eb3-98a4-4a4b-be01-7f4d3ea1c4c2', 'image 3308982790979331432'), ('https://bellingcat-archive.nyc3.cdn.digitaloceanspaces.com/no-dups/715432d8038abe50c8c89994/f49554bf621848e5881388ee.jpg', '314b2eb3-98a4-4a4b-be01-7f4d3ea1c4c2', 'image 3308982791122025152'), ('https://bellingcat-archive.nyc3.cdn.digitaloceanspaces.com/no-dups/f143ce41b1eb329a0c404448/6b3d604f676c4ef583b54ff3.jpg', '314b2eb3-98a4-4a4b-be01-7f4d3ea1c4c2', 'image 3308982790987824546')  ... displaying 10 of 253 total bound parameter sets ...  
('https://bellingcat-archive.nyc3.cdn.digitaloceanspaces.com/no-dups/385ac59684e1148643c762de/0d72d23e6652474a977fe1d6', '314b2eb3-98a4-4a4b-be01-7f4d3ea1c4c2', 'timestamp_authority_filesno-dups/385ac59684e1148643c762de/0d72d23e6652474a977fe1d6_3.0'), ('https://bellingcat-archive.nyc3.cdn.digitaloceanspaces.com/no-dups/261cb9cefe3c373ad5f7f305/545253c297124d31bbf817c9.html', '314b2eb3-98a4-4a4b-be01-7f4d3ea1c4c2', '_final_media'))]
+
+
+2024-02-25 15:43:13.142 | DEBUG    | worker:convert_if_media:200 - error parsing {'pk': '2045172809601551458', 'id': '2045172809601551458_178884643', 'code': 'Bxh6dWkloxi', 'taken_at': '2019-05-16T16:20:16Z', 'media_type': 1, 'product_type': 'story', 'thumbnail_url': 'https://scontent-sjc3-1.cdninstagram.com/v/t51.12442-15/58604296_289070981969290_2714055836620897014_n.jpg?stp=dst-jpg_e35&efg=eyJ2ZW5jb2RlX3RhZyI6ImltYWdlX3VybGdlbi4xMDI0eDE4MjAuc2RyIn0&_nc_ht=scontent-sjc3-1.cdninstagram.com&_nc_cat=110&_nc_ohc=ri2YWjVH4dkAX9TFQox&edm=ANmP7GQBAAAA&ccb=7-5&ig_cache_key=MjA0NTE3MjgwOTYwMTU1MTQ1OA%3D%3D.2-ccb7-5&oh=00_AfCIxHr9jkUmeq9NgbmTpWtURV_eu5JGMRbrsc0WwyO59g&oe=65DD0180&_nc_sid=982cc7', 'user': {'pk': '178884643'}, 'locations': [{'location': {'pk': 218723854, 'name': 'Montañita, Ecuador', 'lng': -80.751982661304, 'lat': -1.829127033905}}]} : 'filename'
+2024-02-25 15:43:13.224 | WARNING  | worker:create_sheet_task:84 - cached result detected: (sqlite3.IntegrityError) UNIQUE constraint failed: archive_urls.url, archive_urls.archive_id
+[SQL: INSERT INTO archive_urls (url, archive_id, "key") VALUES (?, ?, ?)]
+[parameters: (('https://bellingcat-archive.nyc3.cdn.digitaloceanspaces.com/no-dups/3caafdcb057000f8c70610fe/e45ec0b09b854333afa22c1c.jpg', '51fe261b-ad32-4c8a-96cd-801ccfa472fa', 'profile_picture'), ('https://bellingcat-archive.nyc3.cdn.digitaloceanspaces.com/no-dups/58f0b188e55ef64b96d04b35/94cd9b3d2d5c4deebb9ccd13.jpg', '51fe261b-ad32-4c8a-96cd-801ccfa472fa', 'story 3310558716812786504_178884643'), ('https://bellingcat-archive.nyc3.cdn.digitaloceanspaces.com/no-dups/9225487f1a559da4060dcf8e/fe55f26616114856a35cc357.jpg', '51fe261b-ad32-4c8a-96cd-801ccfa472fa', 'post 3301802228841572226'), ('https://bellingcat-archive.nyc3.cdn.digitaloceanspaces.com/no-dups/e26b8bc09aa4febc53d4a42d/50330faff71d4dc28078d5e6.jpg', '51fe261b-ad32-4c8a-96cd-801ccfa472fa', 'image 3301802228850013597'), ('https://bellingcat-archive.nyc3.cdn.digitaloceanspaces.com/no-dups/bec948c1bf5b87f1f0923e1d/e4c3ddef88ad487ab0305402.jpg', '51fe261b-ad32-4c8a-96cd-801ccfa472fa', 'image 3301802228917101263'), ('https://bellingcat-archive.nyc3.cdn.digitaloceanspaces.com/no-dups/38c66be1f1898ea7e0d20d9d/9143edba90184ea387fc856d.jpg', '51fe261b-ad32-4c8a-96cd-801ccfa472fa', 'image 3301802228916961531'), ('https://bellingcat-archive.nyc3.cdn.digitaloceanspaces.com/no-dups/89f16b246931dc98792d570c/859ea90fd5a84d3cbf1a7b46.mp4', '51fe261b-ad32-4c8a-96cd-801ccfa472fa', 'video 3301801735390140959'), ('https://bellingcat-archive.nyc3.cdn.digitaloceanspaces.com/no-dups/7ee6f00d8a05e22996307c20/5f5faa30541d406e9f0ea1f2.mp4', '51fe261b-ad32-4c8a-96cd-801ccfa472fa', 'video 3301801735750883968')  ... displaying 10 of 587 total bound parameter sets ...  
('https://bellingcat-archive.nyc3.cdn.digitaloceanspaces.com/no-dups/ef3bc69b89e40504cd936e8a/f4a13bf448c04af0b8c3aeae', '51fe261b-ad32-4c8a-96cd-801ccfa472fa', 'timestamp_authority_filesno-dups/ef3bc69b89e40504cd936e8a/f4a13bf448c04af0b8c3aeae_3.0'), ('https://bellingcat-archive.nyc3.cdn.digitaloceanspaces.com/no-dups/561ea3dc87c229eb239266f2/8c292219a03445e396001026.html', '51fe261b-ad32-4c8a-96cd-801ccfa472fa', '_final_media'))]
+
+
+
+2024-02-27 13:03:04.585 | ERROR    | auto_archiver.core.orchestrator:archive:128 - ERROR database gsheet_db: {'code': 400, 'message': 'Invalid data[4]: Your input contains more than the maximum of 50000 characters in a single cell.', 'status': 'INVALID_ARGUMENT'}: Traceback (most recent call last):
+  File "/root/.local/share/virtualenvs/app-4PlAip0Q/lib/python3.10/site-packages/auto_archiver/core/orchestrator.py", line 126, in archive
+    try: d.done(result)
+  File "/root/.local/share/virtualenvs/app-4PlAip0Q/lib/python3.10/site-packages/auto_archiver/databases/gsheet_db.py", line 94, in done
+    gw.batch_set_cell(cell_updates)
+  File "/root/.local/share/virtualenvs/app-4PlAip0Q/lib/python3.10/site-packages/auto_archiver/utils/gworksheet.py", line 104, in batch_set_cell
+    self.wks.batch_update(cell_updates, value_input_option='USER_ENTERED')
+  File "/root/.local/share/virtualenvs/app-4PlAip0Q/lib/python3.10/site-packages/gspread/worksheet.py", line 1361, in batch_update
+    response = self.client.values_batch_update(self.spreadsheet_id, body=body)
+  File "/root/.local/share/virtualenvs/app-4PlAip0Q/lib/python3.10/site-packages/gspread/http_client.py", line 263, in values_batch_update
+    r = self.request("post", url, json=body)
+  File "/root/.local/share/virtualenvs/app-4PlAip0Q/lib/python3.10/site-packages/gspread/http_client.py", line 123, in request
+    raise APIError(response)
+gspread.exceptions.APIError: {'code': 400, 'message': 'Invalid data[4]: Your input contains more than the maximum of 50000 characters in a single cell.', 'status': 'INVALID_ARGUMENT'}
Author	SHA1	Message	Date
msramalho	1e375bd740	version bump	2024-05-14 16:42:15 +01:00
Miguel Sozinho Ramalho	f8824691dd	refactors free twitter archiver strategies (#142 )	2024-05-14 16:23:33 +01:00
msramalho	012cc36609	removes deprecated datetime method	2024-05-14 15:54:50 +01:00
Miguel Sozinho Ramalho	7cfe1e39cc	#135 fix cleanup of telethon session files (#139 ) * closes #135 * version bump	2024-04-16 12:45:45 +01:00
Jett Chen	cf8691bad7	Add yt-dlp based archiving for TwitterArchiver (#138 ) * Add ytdlp archiving capability * Add type annotation * version bump --------- Co-authored-by: msramalho <19508417+msramalho@users.noreply.github.com>	2024-04-15 19:54:55 +01:00
R. Miles McCain	f603400d0d	Add direct Atlos integration (#137 ) * Add Atlos feeder * Add Atlos db * Add Atlos storage * Fix Atlos storages * Fix Atlos feeder * Only include URLs in Atlos feeder once they're processed * Remove print * Add Atlos documentation to README * Formatting fixes * Don't archive existing material * avoid KeyError in atlos_db * version bump --------- Co-authored-by: msramalho <19508417+msramalho@users.noreply.github.com>	2024-04-15 19:25:17 +01:00
msramalho	eb37f0b45b	version bump	2024-04-15 19:02:54 +01:00
msramalho	75497f5773	minor bug fix when using an archiver_enricher in enrichers only	2024-04-15 19:02:40 +01:00
msramalho	623e555713	dependencies updates	2024-04-15 19:02:20 +01:00
msramalho	9c7824de57	browsertrix docker updates	2024-04-15 19:01:55 +01:00
msramalho	f4827770e6	adds instagram no stories as success, and fix for telethon-based archivers.	2024-03-05 14:49:10 +00:00
msramalho	601572d76e	strip url	2024-02-29 11:54:01 +00:00
msramalho	d21e79a272	general security updates	2024-02-29 11:40:30 +00:00