Treats an Instagram profile with no stories as a success, and fixes the session-file path for Telethon-based archivers.

strip url
general security updates
2026-06-10 20:28:28 +03:00 · 2024-03-05 14:49:10 +00:00 · 2024-02-29 11:54:01 +00:00 · 2024-02-29 11:40:30 +00:00 · 2024-02-25 15:14:17 +00:00 · 2024-02-23 15:54:33 +00:00
15 changed files with 247 additions and 123 deletions
--- a/Pipfile.lock
+++ b/Pipfile.lock
@@ -172,20 +172,20 @@
        },
        "boto3": {
            "hashes": [
-                "sha256:46432fd506708fec6caec4392d758c6f5b79a376dee67d3284fe8b6bfbafeaf4",
-                "sha256:5c96bed1269f77788780aa2005811dc3a37d4122f08b8e54063a1f4c1b9314a1"
+                "sha256:66303b5f26d92afb72656ff490b22ea72dfff8bf1a29e4a0c5d5f11ec56245dd",
+                "sha256:898ad2123b18cae8efd85adc56ac2d1925be54592aebc237020d4f16e9a9e7a9"
            ],
            "index": "pypi",
            "markers": "python_version >= '3.8'",
-            "version": "==1.34.45"
+            "version": "==1.34.52"
        },
        "botocore": {
            "hashes": [
-                "sha256:bf4fe24dd00a6262a27573dea1690ea68eb20f939e7086effadf19aa1acb44d1",
-                "sha256:e17874ac708fef295d2ea16bb2570ea0512c920de9f25f796de0d8c778f06a02"
+                "sha256:05567d8aba344826060481ea309555432c96f0febe22bee7cf5a3b6d3a03cec8",
+                "sha256:187da93aec3f2e87d8a31eced16fa2cb9c71fe2d69b0a797f9f7a9220f5bf7ae"
            ],
            "markers": "python_version >= '3.8'",
-            "version": "==1.34.45"
+            "version": "==1.34.52"
        },
        "brotli": {
            "hashes": [
@@ -273,7 +273,7 @@
                "sha256:fd5f17ff8f14003595ab414e45fce13d073e0762394f957182e69035c9f3d7c2",
                "sha256:fdc3ff3bfccdc6b9cc7c342c03aa2400683f0cb891d46e94b64a197910dc4064"
            ],
-            "markers": "platform_python_implementation >= 'CPython'",
+            "markers": "implementation_name == 'cpython'",
            "version": "==1.1.0"
        },
        "bs4": {
@@ -286,11 +286,11 @@
        },
        "cachetools": {
            "hashes": [
-                "sha256:086ee420196f7b2ab9ca2db2520aca326318b68fe5ba8bc4d49cca91add450f2",
-                "sha256:861f35a13a451f94e301ce2bec7cac63e881232ccce7ed67fab9b5df4d3beaa1"
+                "sha256:0abad1021d3f8325b2fc1d2e9c8b9c9d57b04c3932657a72465447332c24d945",
+                "sha256:ba29e2dfa0b8b556606f097407ed1aa62080ee108ab0dc5ec9d6a723a007d105"
            ],
            "markers": "python_version >= '3.7'",
-            "version": "==5.3.2"
+            "version": "==5.3.3"
        },
        "certifi": {
            "hashes": [
@@ -479,42 +479,42 @@
        },
        "cryptography": {
            "hashes": [
-                "sha256:04859aa7f12c2b5f7e22d25198ddd537391f1695df7057c8700f71f26f47a129",
-                "sha256:069d2ce9be5526a44093a0991c450fe9906cdf069e0e7cd67d9dee49a62b9ebe",
-                "sha256:0d3ec384058b642f7fb7e7bff9664030011ed1af8f852540c76a1317a9dd0d20",
-                "sha256:0fab2a5c479b360e5e0ea9f654bcebb535e3aa1e493a715b13244f4e07ea8eec",
-                "sha256:0fea01527d4fb22ffe38cd98951c9044400f6eff4788cf52ae116e27d30a1ba3",
-                "sha256:1b797099d221df7cce5ff2a1d272761d1554ddf9a987d3e11f6459b38cd300fd",
-                "sha256:1e935c2900fb53d31f491c0de04f41110351377be19d83d908c1fd502ae8daa5",
-                "sha256:20100c22b298c9eaebe4f0b9032ea97186ac2555f426c3e70670f2517989543b",
-                "sha256:20180da1b508f4aefc101cebc14c57043a02b355d1a652b6e8e537967f1e1b46",
-                "sha256:25b09b73db78facdfd7dd0fa77a3f19e94896197c86e9f6dc16bce7b37a96504",
-                "sha256:2619487f37da18d6826e27854a7f9d4d013c51eafb066c80d09c63cf24505306",
-                "sha256:2eb6368d5327d6455f20327fb6159b97538820355ec00f8cc9464d617caecead",
-                "sha256:35772a6cffd1f59b85cb670f12faba05513446f80352fe811689b4e439b5d89e",
-                "sha256:39d5c93e95bcbc4c06313fc6a500cee414ee39b616b55320c1904760ad686938",
-                "sha256:3d96ea47ce6d0055d5b97e761d37b4e84195485cb5a38401be341fabf23bc32a",
-                "sha256:4dcab7c25e48fc09a73c3e463d09ac902a932a0f8d0c568238b3696d06bf377b",
-                "sha256:5fbf0f3f0fac7c089308bd771d2c6c7b7d53ae909dce1db52d8e921f6c19bb3a",
-                "sha256:6c25e1e9c2ce682d01fc5e2dde6598f7313027343bd14f4049b82ad0402e52cd",
-                "sha256:762f3771ae40e111d78d77cbe9c1035e886ac04a234d3ee0856bf4ecb3749d54",
-                "sha256:90147dad8c22d64b2ff7331f8d4cddfdc3ee93e4879796f837bdbb2a0b141e0c",
-                "sha256:935cca25d35dda9e7bd46a24831dfd255307c55a07ff38fd1a92119cffc34857",
-                "sha256:93fbee08c48e63d5d1b39ab56fd3fdd02e6c2431c3da0f4edaf54954744c718f",
-                "sha256:9541c69c62d7446539f2c1c06d7046aef822940d248fa4b8962ff0302862cc1f",
-                "sha256:c23f03cfd7d9826cdcbad7850de67e18b4654179e01fe9bc623d37c2638eb4ef",
-                "sha256:c3d1f5a1d403a8e640fa0887e9f7087331abb3f33b0f2207d2cc7f213e4a864c",
-                "sha256:d1998e545081da0ab276bcb4b33cce85f775adb86a516e8f55b3dac87f469548",
-                "sha256:d5cf11bc7f0b71fb71af26af396c83dfd3f6eed56d4b6ef95d57867bf1e4ba65",
-                "sha256:db0480ffbfb1193ac4e1e88239f31314fe4c6cdcf9c0b8712b55414afbf80db4",
-                "sha256:de4ae486041878dc46e571a4c70ba337ed5233a1344c14a0790c4c4be4bbb8b4",
-                "sha256:de5086cd475d67113ccb6f9fae6d8fe3ac54a4f9238fd08bfdb07b03d791ff0a",
-                "sha256:df34312149b495d9d03492ce97471234fd9037aa5ba217c2a6ea890e9166f151",
-                "sha256:ead69ba488f806fe1b1b4050febafdbf206b81fa476126f3e16110c818bac396"
+                "sha256:0270572b8bd2c833c3981724b8ee9747b3ec96f699a9665470018594301439ee",
+                "sha256:111a0d8553afcf8eb02a4fea6ca4f59d48ddb34497aa8706a6cf536f1a5ec576",
+                "sha256:16a48c23a62a2f4a285699dba2e4ff2d1cff3115b9df052cdd976a18856d8e3d",
+                "sha256:1b95b98b0d2af784078fa69f637135e3c317091b615cd0905f8b8a087e86fa30",
+                "sha256:1f71c10d1e88467126f0efd484bd44bca5e14c664ec2ede64c32f20875c0d413",
+                "sha256:2424ff4c4ac7f6b8177b53c17ed5d8fa74ae5955656867f5a8affaca36a27abb",
+                "sha256:2bce03af1ce5a5567ab89bd90d11e7bbdff56b8af3acbbec1faded8f44cb06da",
+                "sha256:329906dcc7b20ff3cad13c069a78124ed8247adcac44b10bea1130e36caae0b4",
+                "sha256:37dd623507659e08be98eec89323469e8c7b4c1407c85112634ae3dbdb926fdd",
+                "sha256:3eaafe47ec0d0ffcc9349e1708be2aaea4c6dd4978d76bf6eb0cb2c13636c6fc",
+                "sha256:5e6275c09d2badf57aea3afa80d975444f4be8d3bc58f7f80d2a484c6f9485c8",
+                "sha256:6fe07eec95dfd477eb9530aef5bead34fec819b3aaf6c5bd6d20565da607bfe1",
+                "sha256:7367d7b2eca6513681127ebad53b2582911d1736dc2ffc19f2c3ae49997496bc",
+                "sha256:7cde5f38e614f55e28d831754e8a3bacf9ace5d1566235e39d91b35502d6936e",
+                "sha256:9481ffe3cf013b71b2428b905c4f7a9a4f76ec03065b05ff499bb5682a8d9ad8",
+                "sha256:98d8dc6d012b82287f2c3d26ce1d2dd130ec200c8679b6213b3c73c08b2b7940",
+                "sha256:a011a644f6d7d03736214d38832e030d8268bcff4a41f728e6030325fea3e400",
+                "sha256:a2913c5375154b6ef2e91c10b5720ea6e21007412f6437504ffea2109b5a33d7",
+                "sha256:a30596bae9403a342c978fb47d9b0ee277699fa53bbafad14706af51fe543d16",
+                "sha256:b03c2ae5d2f0fc05f9a2c0c997e1bc18c8229f392234e8a0194f202169ccd278",
+                "sha256:b6cd2203306b63e41acdf39aa93b86fb566049aeb6dc489b70e34bcd07adca74",
+                "sha256:b7ffe927ee6531c78f81aa17e684e2ff617daeba7f189f911065b2ea2d526dec",
+                "sha256:b8cac287fafc4ad485b8a9b67d0ee80c66bf3574f655d3b97ef2e1082360faf1",
+                "sha256:ba334e6e4b1d92442b75ddacc615c5476d4ad55cc29b15d590cc6b86efa487e2",
+                "sha256:ba3e4a42397c25b7ff88cdec6e2a16c2be18720f317506ee25210f6d31925f9c",
+                "sha256:c41fb5e6a5fe9ebcd58ca3abfeb51dffb5d83d6775405305bfa8715b76521922",
+                "sha256:cd2030f6650c089aeb304cf093f3244d34745ce0cfcc39f20c6fbfe030102e2a",
+                "sha256:cd65d75953847815962c84a4654a84850b2bb4aed3f26fadcc1c13892e1e29f6",
+                "sha256:e4985a790f921508f36f81831817cbc03b102d643b5fcb81cd33df3fa291a1a1",
+                "sha256:e807b3188f9eb0eaa7bbb579b462c5ace579f1cedb28107ce8b48a9f7ad3679e",
+                "sha256:f12764b8fffc7a123f641d7d049d382b73f96a34117e0b637b80643169cec8ac",
+                "sha256:f8837fe1d6ac4a8052a9a8ddab256bc006242696f03368a4009be7ee3075cdb7"
            ],
            "index": "pypi",
            "markers": "python_version >= '3.7'",
-            "version": "==42.0.3"
+            "version": "==42.0.5"
        },
        "dataclasses-json": {
            "hashes": [
@@ -651,10 +651,11 @@
        },
        "future": {
            "hashes": [
-                "sha256:34a17436ed1e96697a86f9de3d15a3b0be01d8bc8de9c1dffd59fb8234ed5307"
+                "sha256:929292d34f5872e70396626ef385ec22355a1fae8ad29e1a734c3e43f9fbc216",
+                "sha256:bd2968309307861edae1458a4f8a4f3598c03be43b97521076aebf5d94c07b05"
            ],
            "markers": "python_version >= '2.6' and python_version not in '3.0, 3.1, 3.2, 3.3'",
-            "version": "==0.18.3"
+            "version": "==1.0.0"
        },
        "google-api-core": {
            "hashes": [
@@ -666,20 +667,20 @@
        },
        "google-api-python-client": {
            "hashes": [
-                "sha256:9d83b178496b180e058fd206ebfb70ea1afab49f235dd326f557513f56f496d5",
-                "sha256:ebf4927a3f5184096647be8f705d090e7f06d48ad82b0fa431a2fe80c2cbe182"
+                "sha256:84e43bdb58dd8d2301669513863996378ffe9a3bf6d23b5ccd4f1e021323dbeb",
+                "sha256:ff9ef7539eaf7e088a481b25d1af4704210b07863e1d51b5ee498b910a3a46a3"
            ],
            "index": "pypi",
            "markers": "python_version >= '3.7'",
-            "version": "==2.118.0"
+            "version": "==2.119.0"
        },
        "google-auth": {
            "hashes": [
-                "sha256:3cfc1b6e4e64797584fb53fc9bd0b7afa9b7c0dba2004fa7dcc9349e58cc3195",
-                "sha256:7634d29dcd1e101f5226a23cbc4a0c6cda6394253bf80e281d9c5c6797869c53"
+                "sha256:25141e2d7a14bfcba945f5e9827f98092716e99482562f15306e5b026e21aa72",
+                "sha256:34fc3046c257cedcf1622fc4b31fc2be7923d9b4d44973d481125ecc50d83885"
            ],
            "markers": "python_version >= '3.7'",
-            "version": "==2.28.0"
+            "version": "==2.28.1"
        },
        "google-auth-httplib2": {
            "hashes": [
@@ -725,11 +726,11 @@
        },
        "httpcore": {
            "hashes": [
-                "sha256:5c0f9546ad17dac4d0772b0808856eb616eb8b48ce94f49ed819fd6982a8a544",
-                "sha256:9a6a501c3099307d9fd76ac244e08503427679b1e81ceb1d922485e2f2462ad2"
+                "sha256:ac418c1db41bade2ad53ae2f3834a3a0f5ae76b56cf5aa497d2d033384fc7d73",
+                "sha256:cb2839ccfcba0d2d3c1131d3c3e26dfc327326fbe7a5dc0dbfe9f6c9151bb022"
            ],
            "markers": "python_version >= '3.8'",
-            "version": "==1.0.3"
+            "version": "==1.0.4"
        },
        "httplib2": {
            "hashes": [
@@ -741,11 +742,11 @@
        },
        "httpx": {
            "hashes": [
-                "sha256:451b55c30d5185ea6b23c2c793abf9bb237d2a7dfb901ced6ff69ad37ec1dfaf",
-                "sha256:8915f5a3627c4d47b73e8202457cb28f1266982d1159bd5779d86a80c0eab1cd"
+                "sha256:71d5465162c13681bff01ad59b2cc68dd838ea1f10e51574bac27103f00c91a5",
+                "sha256:a0cb88a46f32dc874e04ee956e4c2764aba2aa228f650b06788ba6bda2962ab5"
            ],
            "markers": "python_version >= '3.8'",
-            "version": "==0.26.0"
+            "version": "==0.27.0"
        },
        "idna": {
            "hashes": [
@@ -966,11 +967,11 @@
        },
        "marshmallow": {
            "hashes": [
-                "sha256:4c1daff273513dc5eb24b219a8035559dc573c8f322558ef85f5438ddd1236dd",
-                "sha256:c21d4b98fee747c130e6bc8f45c4b3199ea66bc00c12ee1f639f0aeca034d5e9"
+                "sha256:20f53be28c6e374a711a16165fb22a8dc6003e3f7cda1285e3ca777b9193885b",
+                "sha256:e7997f83571c7fd476042c2c188e4ee8a78900ca5e74bd9c8097afa56624e9bd"
            ],
            "markers": "python_version >= '3.8'",
-            "version": "==3.20.2"
+            "version": "==3.21.0"
        },
        "mdurl": {
            "hashes": [
@@ -1648,11 +1649,11 @@
        },
        "rich": {
            "hashes": [
-                "sha256:5cb5123b5cf9ee70584244246816e9114227e0b98ad9176eede6ad54bf5403fa",
-                "sha256:6da14c108c4866ee9520bbffa71f6fe3962e193b7da68720583850cd4548e235"
+                "sha256:4edbae314f59eb482f54e9e30bf00d33350aaa94f4bfcd4e9e3110e64d0d7222",
+                "sha256:9be308cb1fe2f1f57d67ce99e95af38a1e2bc71ad9813b0e247cf7ffbcc3a432"
            ],
            "markers": "python_full_version >= '3.7.0'",
-            "version": "==13.7.0"
+            "version": "==13.7.1"
        },
        "rsa": {
            "hashes": [
@@ -1689,11 +1690,11 @@
        },
        "sniffio": {
            "hashes": [
-                "sha256:e60305c5e5d314f5389259b7f22aaa33d8f7dee49763119234af3755c55b9101",
-                "sha256:eecefdce1e5bbfb7ad2eeaabf7c1eeb404d7757c379bd1f7e5cce9d8bf425384"
+                "sha256:2f6da418d1f1e0fddd844478f41680e794e6051915791a034ff65e5f100525a2",
+                "sha256:f4324edc670a0f49750a81b895f35c3adb843cca46f0530f79fc1babb23789dc"
            ],
            "markers": "python_version >= '3.7'",
-            "version": "==1.3.0"
+            "version": "==1.3.1"
        },
        "snscrape": {
            "hashes": [
@@ -1783,11 +1784,11 @@
        },
        "typing-extensions": {
            "hashes": [
-                "sha256:23478f88c37f27d76ac8aee6c905017a143b0b1b886c3c9f66bc2fd94f9f5783",
-                "sha256:af72aea155e91adfc61c3ae9e0e342dbc0cba726d6cba4b6c72c1f34e47291cd"
+                "sha256:69b1a937c3a517342112fb4c6df7e72fc39a38e7891a5730ed4985b5214b5475",
+                "sha256:b0abd7c89e8fb96f98db18d86106ff1d90ab692004eb746cf6eda2682f91b3cb"
            ],
            "markers": "python_version >= '3.8'",
-            "version": "==4.9.0"
+            "version": "==4.10.0"
        },
        "typing-inspect": {
            "hashes": [
@@ -1922,7 +1923,7 @@
                "sha256:fc4e7fa5414512b481a2483775a8e8be7803a35b30ca805afa4998a84f9fd9e8",
                "sha256:ffefa1374cd508d633646d51a8e9277763a9b78ae71324183693959cf94635a7"
            ],
-            "markers": "python_version >= '3.7'",
+            "markers": "python_version >= '3.8'",
            "version": "==12.0"
        },
        "werkzeug": {
--- a/example.orchestration.yaml
+++ b/example.orchestration.yaml
@@ -7,6 +7,7 @@ steps:
    # - telegram_archiver
    # - twitter_archiver
    # - twitter_api_archiver
+    # - instagram_api_archiver
    # - instagram_tbot_archiver
    # - instagram_archiver
    # - tiktok_archiver
--- a/src/auto_archiver/archivers/instagram_api_archiver.py
+++ b/src/auto_archiver/archivers/instagram_api_archiver.py
@@ -22,6 +22,7 @@ class InstagramAPIArchiver(Archiver):
        super().__init__(config)
        self.assert_valid_string("access_token")
        self.assert_valid_string("api_endpoint")
+        self.full_profile_max_posts = int(self.full_profile_max_posts)
        if self.api_endpoint[-1] == "/": self.api_endpoint = self.api_endpoint[:-1]

        self.full_profile = bool(self.full_profile)
@@ -33,6 +34,7 @@ class InstagramAPIArchiver(Archiver):
            "access_token": {"default": None, "help": "a valid instagrapi-api token"},
            "api_endpoint": {"default": None, "help": "API endpoint to use"},
            "full_profile": {"default": False, "help": "if true, will download all posts, tagged posts, stories, and highlights for a profile, if false, will only download the profile pic and information."},
+            "full_profile_max_posts": {"default": 0, "help": "Use to limit the number of posts to download when full_profile is true. 0 means no limit. limit is applied softly since posts are fetched in batch, once to: posts, tagged posts, and highlights"},
            "minimize_json_output": {"default": True, "help": "if true, will remove empty values from the json output"},
        }
    
@@ -73,9 +75,9 @@ class InstagramAPIArchiver(Archiver):
        if type(d) == list: return [self.cleanup_dict(v) for v in d]
        if type(d) != dict: return d
        return {
-                k: self.cleanup_dict(v) if type(v) in [dict, list] else v 
+                k: clean_v
                for k, v in d.items() 
-                if v not in [0.0, 0, [], {}, "", None, "null"] and
+                if (clean_v := self.cleanup_dict(v)) not in [0.0, 0, [], {}, "", None, "null"] and
                k not in ["x", "y", "width", "height"]
        }

@@ -93,9 +95,6 @@ class InstagramAPIArchiver(Archiver):

        if self.full_profile:
            user_id = user.get("pk")
-            # download all posts
-            self.download_all_posts(result, user_id)
-
            # download all stories
            try:
                stories = self._download_stories_reusable(result, username)
@@ -104,25 +103,46 @@ class InstagramAPIArchiver(Archiver):
                result.append("errors", f"Error downloading stories for {username}")
                logger.error(f"Error downloading stories for {username}: {e}")

+            # download all posts
+            try:
+                self.download_all_posts(result, user_id)
+            except Exception as e:
+                result.append("errors", f"Error downloading posts for {username}")
+                logger.error(f"Error downloading posts for {username}: {e}")
+
+            # download all tagged
+            try:
+                self.download_all_tagged(result, user_id)
+            except Exception as e:
+                result.append("errors", f"Error downloading tagged posts for {username}")
+                logger.error(f"Error downloading tagged posts for {username}: {e}")
+
            # download all highlights
            try:
-                count_highlights = 0
-                highlights = self.call_api(f"v1/user/highlights", {"user_id": user_id})
-                for h in highlights:
-                    try: 
-                        h_info = self._download_highlights_reusable(result, h.get("pk"))
-                        count_highlights += len(h_info.get("items", []))
-                    except Exception as e:
-                        result.append("errors", f"Error downloading highlight id{h.get('pk')} for {username}")
-                        logger.error(f"Error downloading highlight id{h.get('pk')} for {username}: {e}")
-                result.set("#highlights", count_highlights)
+                self.download_all_highlights(result, username, user_id)
            except Exception as e:
                result.append("errors", f"Error downloading highlights for {username}")
                logger.error(f"Error downloading highlights for {username}: {e}")

+
        result.set_url(url) # reset as scrape_item modifies it
        return result.success("insta profile")

+    def download_all_highlights(self, result, username, user_id):
+        count_highlights = 0
+        highlights = self.call_api(f"v1/user/highlights", {"user_id": user_id})
+        for h in highlights:
+            try: 
+                h_info = self._download_highlights_reusable(result, h.get("pk"))
+                count_highlights += len(h_info.get("items", []))
+            except Exception as e:
+                result.append("errors", f"Error downloading highlight id{h.get('pk')} for {username}")
+                logger.error(f"Error downloading highlight id{h.get('pk')} for {username}: {e}")
+            if self.full_profile_max_posts and count_highlights >= self.full_profile_max_posts:
+                logger.info(f"HIGHLIGHTS reached full_profile_max_posts={self.full_profile_max_posts}")
+                break
+        result.set("#highlights", count_highlights)
+
    def download_post(self, result: Metadata, code: str = None, id: str = None, context: str = None) -> Metadata:
        if id:
            post = self.call_api(f"v1/media/by/id", {"id": id})
@@ -166,12 +186,13 @@ class InstagramAPIArchiver(Archiver):
    def download_stories(self, result: Metadata, username: str) -> Metadata:
        now = datetime.now().strftime("%Y-%m-%d_%H-%M")
        stories = self._download_stories_reusable(result, username)
+        if stories == []: return result.success("insta no story")
        result.set_title(f"stories {username} at {now}").set("#stories", len(stories))
        return result.success(f"insta stories {now}")
    
    def _download_stories_reusable(self, result: Metadata, username: str) -> list[dict]:
        stories = self.call_api(f"v1/user/stories/by/username", {"username": username})
-        assert stories, f"Stories for {username} not found"
+        if not stories or not len(stories): return []
        stories = stories[::-1] # newest to oldest

        for s in tqdm(stories, desc="downloading stories", unit="story"):
@@ -188,7 +209,7 @@ class InstagramAPIArchiver(Archiver):
        post_count = 0
        while end_cursor != "":
            posts = self.call_api(f"v1/user/medias/chunk", {"user_id": user_id, "end_cursor": end_cursor})
-            if not len(posts): break
+            if not isinstance(posts, list) or len(posts) != 2: break
            posts, end_cursor = posts[0], posts[1]
            logger.info(f"parsing {len(posts)} posts, next {end_cursor=}")

@@ -199,7 +220,35 @@ class InstagramAPIArchiver(Archiver):
                    logger.error(f"Error downloading post, skipping {p.get('id')}: {e}")
                pbar.update(1)
                post_count+=1
+            if self.full_profile_max_posts and post_count >= self.full_profile_max_posts:
+                logger.info(f"POSTS reached full_profile_max_posts={self.full_profile_max_posts}")
+                break
        result.set("#posts", post_count)
+        
+    def download_all_tagged(self, result: Metadata, user_id: str):
+        next_page_id = ""
+        pbar = tqdm(desc="downloading tagged posts")
+
+        tagged_count = 0
+        while next_page_id is not None:
+            resp = self.call_api(f"v2/user/tag/medias", {"user_id": user_id, "page_id": next_page_id})
+            posts = resp.get("response", {}).get("items", [])
+            if not len(posts): break
+            next_page_id = resp.get("next_page_id")
+            
+            logger.info(f"parsing {len(posts)} tagged posts, next {next_page_id=}")
+
+            for p in posts:
+                try: self.scrape_item(result, p, "tagged")
+                except Exception as e:
+                    result.append("errors", f"Error downloading tagged post {p.get('id')}")
+                    logger.error(f"Error downloading tagged post, skipping {p.get('id')}: {e}")
+                pbar.update(1)
+                tagged_count+=1
+            if self.full_profile_max_posts and tagged_count >= self.full_profile_max_posts:
+                logger.info(f"TAGS reached full_profile_max_posts={self.full_profile_max_posts}")
+                break
+        result.set("#tagged", tagged_count)


 ### reusable parsing utils below
@@ -217,10 +266,10 @@ class InstagramAPIArchiver(Archiver):
            if self.minimize_json_output: 
                del item["clips_metadata"]

-        if code := item.get("code"): 
-            result.set("url", f"https://www.instagram.com/p/{code}/")
+        if (code := item.get("code")) and not result.get("url"):
+            result.set_url(f"https://www.instagram.com/p/{code}/")
            
-        resources = item.get("resources", [])
+        resources = item.get("resources", item.get("carousel_media", []))
        item, media, media_id = self.scrape_media(item, context)
        # if resources are present take the main media from the first resource
        if not media and len(resources):
@@ -242,7 +291,7 @@ class InstagramAPIArchiver(Archiver):
    def scrape_media(self, item: dict, context:str) -> tuple[dict, Media, str]:
        # remove unnecessary info
        if self.minimize_json_output: 
-            for k in ["image_versions", "video_versions", "video_dash_manifest"]:
+            for k in ["image_versions", "video_versions", "video_dash_manifest", "image_versions2", "video_versions2"]:
                if k in item: del item[k]
        item = self.cleanup_dict(item)

@@ -253,19 +302,24 @@ class InstagramAPIArchiver(Archiver):
            
        # retrieve video info
        best_id = item.get('id', item.get('pk'))
-        taken_at = item.get("taken_at")
+        taken_at = item.get("taken_at", item.get("taken_at_ts"))
        code = item.get("code")
+        caption_text = item.get("caption_text")
+        if "carousel_media" in item: del item["carousel_media"]
+
        if video_url := item.get("video_url"):
            filename = self.download_from_url(video_url, verbose=False)
            video_media = Media(filename=filename)
            if taken_at: video_media.set("date", taken_at)
            if code: video_media.set("url", f"https://www.instagram.com/p/{code}")
+            if caption_text: video_media.set("text", caption_text)
            video_media.set("preview", [image_media])
            video_media.set("data", [item])
            return item, video_media, f"{context or 'video'} {best_id}"
        elif image_media:
            if taken_at: image_media.set("date", taken_at)
            if code: image_media.set("url", f"https://www.instagram.com/p/{code}")
+            if caption_text: image_media.set("text", caption_text)
            image_media.set("data", [item])
            return item, image_media, f"{context or 'image'} {best_id}"
        
--- a/src/auto_archiver/archivers/instagram_tbot_archiver.py
+++ b/src/auto_archiver/archivers/instagram_tbot_archiver.py
@@ -42,7 +42,7 @@ class InstagramTbotArchiver(Archiver):
        # make a copy of the session that is used exclusively with this archiver instance
        new_session_file = os.path.join("secrets/", f"instabot-{time.strftime('%Y-%m-%d')}{random_str(8)}.session")
        shutil.copy(self.session_file + ".session", new_session_file)
-        self.session_file = new_session_file
+        self.session_file = new_session_file.replace(".session", "")

        try:
            self.client = TelegramClient(self.session_file, self.api_id, self.api_hash)
--- a/src/auto_archiver/archivers/telethon_archiver.py
+++ b/src/auto_archiver/archivers/telethon_archiver.py
@@ -49,7 +49,7 @@ class TelethonArchiver(Archiver):
        # make a copy of the session that is used exclusively with this archiver instance
        new_session_file = os.path.join("secrets/", f"telethon-{time.strftime('%Y-%m-%d')}{random_str(8)}.session")
        shutil.copy(self.session_file + ".session", new_session_file)
-        self.session_file = new_session_file
+        self.session_file = new_session_file.replace(".session", "")

        # initiate the client
        self.client = TelegramClient(self.session_file, self.api_id, self.api_hash)
--- a/src/auto_archiver/archivers/youtubedl_archiver.py
+++ b/src/auto_archiver/archivers/youtubedl_archiver.py
@@ -15,6 +15,8 @@ class YoutubeDLArchiver(Archiver):
        self.livestreams = bool(self.livestreams)
        self.live_from_start = bool(self.live_from_start)
        self.end_means_success = bool(self.end_means_success)
+        self.allow_playlist = bool(self.allow_playlist)
+        self.max_downloads = self.max_downloads

    @staticmethod
    def configs() -> dict:
@@ -26,6 +28,8 @@ class YoutubeDLArchiver(Archiver):
            "live_from_start": {"default": False, "help": "if set, will download live streams from their earliest available moment, otherwise starts now."},
            "proxy": {"default": "", "help": "http/socks (https seems to not work atm) proxy to use for the webdriver, eg https://proxy-user:password@proxy-ip:port"},
            "end_means_success": {"default": True, "help": "if True, any archived content will mean a 'success', if False this archiver will not return a 'success' stage; this is useful for cases when the yt-dlp will archive a video but ignore other types of content like images or text only pages that the subsequent archivers can retrieve."},
+            'allow_playlist': {"default": False, "help": "If True will also download playlists, set to False if the expectation is to download a single video."},
+            "max_downloads": {"default": "inf", "help": "Use to limit the number of videos to download when a channel or long page is being extracted. 'inf' means no limit."},
        }

    def download(self, item: Metadata) -> Metadata:
@@ -35,11 +39,11 @@ class YoutubeDLArchiver(Archiver):
            logger.debug('Using Facebook cookie')
            yt_dlp.utils.std_headers['cookie'] = self.facebook_cookie

-        ydl_options = {'outtmpl': os.path.join(ArchivingContext.get_tmp_dir(), f'%(id)s.%(ext)s'), 'quiet': False, 'noplaylist': True, 'writesubtitles': self.subtitles, 'writeautomaticsub': self.subtitles, "live_from_start": self.live_from_start, "proxy": self.proxy}
+        ydl_options = {'outtmpl': os.path.join(ArchivingContext.get_tmp_dir(), f'%(id)s.%(ext)s'), 'quiet': False, 'noplaylist': not self.allow_playlist , 'writesubtitles': self.subtitles, 'writeautomaticsub': self.subtitles, "live_from_start": self.live_from_start, "proxy": self.proxy, "max_downloads": self.max_downloads, "playlistend": self.max_downloads}
        ydl = yt_dlp.YoutubeDL(ydl_options) # allsubtitles and subtitleslangs not working as expected, so default lang is always "en"

        try:
-            # don'd download since it can be a live stream
+            # don't download since it can be a live stream
            info = ydl.extract_info(url, download=False)
            if info.get('is_live', False) and not self.livestreams:
                logger.warning("Livestream detected, skipping due to 'livestreams' configuration setting")
@@ -52,7 +56,8 @@ class YoutubeDLArchiver(Archiver):
            return False

        # this time download
-        ydl = yt_dlp.YoutubeDL({**ydl_options, "getcomments": self.comments}) 
+        ydl = yt_dlp.YoutubeDL({**ydl_options, "getcomments": self.comments})
+        #TODO: for playlist or long lists of videos, how to download one at a time so they can be stored before the next one is downloaded?
        info = ydl.extract_info(url, download=True)

        if "entries" in info:
@@ -64,13 +69,17 @@ class YoutubeDLArchiver(Archiver):

        result = Metadata()
        result.set_title(info.get("title"))
+        if "description" in info: result.set_content(info["description"])
        for entry in entries:
            try:
                filename = ydl.prepare_filename(entry)
                if not os.path.exists(filename):
                    filename = filename.split('.')[0] + '.mkv'
-                new_media = Media(filename).set("duration", info.get("duration"))
-                
+
+                new_media = Media(filename)
+                for x in ["duration", "original_url", "fulltitle", "description", "upload_date"]:
+                    if x in entry: new_media.set(x, entry[x])
+
                # read text from subtitles if enabled
                if self.subtitles:
                    for lang, val in (info.get('requested_subtitles') or {}).items():
--- a/src/auto_archiver/core/orchestrator.py
+++ b/src/auto_archiver/core/orchestrator.py
@@ -1,5 +1,7 @@
 from __future__ import annotations
 from typing import Generator, Union, List
+from urllib.parse import urlparse
+from ipaddress import ip_address

 from .context import ArchivingContext

@@ -60,7 +62,9 @@ class ArchivingOrchestrator:
            exit()
        except Exception as e:
            logger.error(f'Got unexpected error on item {item}: {e}\n{traceback.format_exc()}')
-            for d in self.databases: d.failed(item)
+            for d in self.databases:
+                if type(e) == AssertionError: d.failed(item, str(e))
+                else: d.failed(item)


    def archive(self, result: Metadata) -> Union[Metadata, None]:
@@ -73,7 +77,8 @@ class ArchivingOrchestrator:
            5. Store all downloaded/generated media
            6. Call selected Formatter and store formatted if needed
        """
-        original_url = result.get_url()
+        original_url = result.get_url().strip()
+        self.assert_valid_url(original_url)

        # 1 - sanitize - each archiver is responsible for cleaning/expanding its own URLs
        url = original_url
@@ -90,7 +95,9 @@ class ArchivingOrchestrator:
        if cached_result:
            logger.debug("Found previously archived entry")
            for d in self.databases:
-                d.done(cached_result, cached=True)
+                try: d.done(cached_result, cached=True)
+                except Exception as e:
+                    logger.error(f"ERROR database {d.name}: {e}: {traceback.format_exc()}")
            return cached_result

        # 3 - call archivers until one succeeds
@@ -120,6 +127,29 @@ class ArchivingOrchestrator:
            result.status = "nothing archived"

        # signal completion to databases and archivers
-        for d in self.databases: d.done(result)
+        for d in self.databases:
+            try: d.done(result)
+            except Exception as e:
+                logger.error(f"ERROR database {d.name}: {e}: {traceback.format_exc()}")

        return result
+
+    def assert_valid_url(self, url: str) -> bool:
+        """
+        Blocks localhost, private, reserved, and link-local IPs and all non-http/https schemes.
+        Raises AssertionError explicitly (a bare `assert` is stripped under `python -O`); returns True if valid.
+        """
+        if not url.startswith(("http://", "https://")): raise AssertionError("Invalid URL scheme")
+
+        parsed = urlparse(url)
+        if parsed.scheme not in ("http", "https"): raise AssertionError("Invalid URL scheme")
+        if not parsed.hostname: raise AssertionError("Invalid URL hostname")
+        if parsed.hostname == "localhost": raise AssertionError("Invalid URL")
+
+        try: # special rules for IP addresses
+            ip = ip_address(parsed.hostname)
+        except ValueError: return True # hostname is a domain name, not an IP literal
+        # every address class is checked explicitly rather than relying on is_global alone
+        if not ip.is_global or ip.is_reserved or ip.is_link_local or ip.is_private:
+            raise AssertionError("Invalid IP used")
+        return True
--- a/src/auto_archiver/databases/api_db.py
+++ b/src/auto_archiver/databases/api_db.py
@@ -23,8 +23,7 @@ class AAApiDb(Database):
    def configs() -> dict:
        return {
            "api_endpoint": {"default": None, "help": "API endpoint where calls are made to"},
-            "api_secret": {"default": None, "help": "API Basic authentication secret [deprecating soon]"},
-            "api_token": {"default": None, "help": "API Bearer token, to be preferred over secret (Basic auth) going forward"},
+            "api_token": {"default": None, "help": "API Bearer token."},
            "public": {"default": False, "help": "whether the URL should be publicly available via the API"},
            "author_id": {"default": None, "help": "which email to assign as author"},
            "group_id": {"default": None, "help": "which group of users have access to the archive in case public=false as author"},
@@ -59,7 +58,7 @@ class AAApiDb(Database):
        logger.debug(f"saving archive of {item.get_url()} to the AA API.")

        payload = {'result': item.to_json(), 'public': self.public, 'author_id': self.author_id, 'group_id': self.group_id, 'tags': list(self.tags)}
-        headers = {"Authorization": f"Bearer {self.api_secret}"}
+        headers = {"Authorization": f"Bearer {self.api_token}"}
        response = requests.post(os.path.join(self.api_endpoint, "submit-archive"), json=payload, headers=headers)

        if response.status_code == 200:
--- a/src/auto_archiver/databases/console_db.py
+++ b/src/auto_archiver/databases/console_db.py
@@ -21,8 +21,8 @@ class ConsoleDb(Database):
    def started(self, item: Metadata) -> None:
        logger.warning(f"STARTED {item}")

-    def failed(self, item: Metadata) -> None:
-        logger.error(f"FAILED {item}")
+    def failed(self, item: Metadata, reason: str = "") -> None:  # reason is optional: failed(item) callers must keep working
+        logger.error(f"FAILED {item}: {reason}")

    def aborted(self, item: Metadata) -> None:
        logger.warning(f"ABORTED {item}")
--- a/src/auto_archiver/databases/database.py
+++ b/src/auto_archiver/databases/database.py
@@ -22,7 +22,7 @@ class Database(Step, ABC):
        """signals the DB that the given item archival has started"""
        pass

-    def failed(self, item: Metadata) -> None:
+    def failed(self, item: Metadata, reason: str = "") -> None:  # reason is optional: failed(item) callers must keep working
        """update DB accordingly for failure"""
        pass

--- a/src/auto_archiver/databases/gsheet_db.py
+++ b/src/auto_archiver/databases/gsheet_db.py
@@ -29,9 +29,9 @@ class GsheetsDb(Database):
        gw, row = self._retrieve_gsheet(item)
        gw.set_cell(row, 'status', 'Archive in progress')

-    def failed(self, item: Metadata) -> None:
+    def failed(self, item: Metadata, reason: str = "") -> None:  # reason is optional: failed(item) callers must keep working
        logger.error(f"FAILED {item}")
-        self._safe_status_update(item, 'Archive failed')
+        self._safe_status_update(item, f'Archive failed {reason}'.rstrip())  # no trailing space when reason is empty

    def aborted(self, item: Metadata) -> None:
        logger.warning(f"ABORTED {item}")
@@ -102,6 +102,11 @@ class GsheetsDb(Database):

    def _retrieve_gsheet(self, item: Metadata) -> Tuple[GWorksheet, int]:
        # TODO: to make gsheet_db less coupled with gsheet_feeder's "gsheet" parameter, this method could 1st try to fetch "gsheet" from ArchivingContext and, if missing, manage its own singleton - not needed for now
-        gw: GWorksheet = ArchivingContext.get("gsheet").get("worksheet")
-        row: int = ArchivingContext.get("gsheet").get("row")
+        gsheet = ArchivingContext.get("gsheet")
+        if not gsheet:
+            # without this guard the return below fails with an opaque UnboundLocalError on gw/row
+            raise ValueError("no 'gsheet' entry in ArchivingContext, cannot locate the worksheet row")
+        gw: GWorksheet = gsheet.get("worksheet")
+        row: int = gsheet.get("row")
+
        return gw, row
--- a/src/auto_archiver/enrichers/ssl_enricher.py
+++ b/src/auto_archiver/enrichers/ssl_enricher.py
@@ -27,7 +27,10 @@ class SSLEnricher(Enricher):
        if not to_enrich.media and self.skip_when_nothing_archived: return
        
        url = to_enrich.get_url()
-        domain = urlparse(url).netloc
+        parsed = urlparse(url)
+        assert parsed.scheme in ["https"], f"Invalid URL scheme {url=}"
+        
+        domain = parsed.netloc
        logger.debug(f"fetching SSL certificate for {domain=} in {url=}")

        cert = ssl.get_server_certificate((domain, 443))
--- a/src/auto_archiver/enrichers/wacz_enricher.py
+++ b/src/auto_archiver/enrichers/wacz_enricher.py
@@ -35,6 +35,22 @@ class WaczArchiverEnricher(Enricher, Archiver):
            "socks_proxy_host": {"default": None, "help": "SOCKS proxy host for browsertrix-crawler, use in combination with socks_proxy_port. eg: user:password@host"},
            "socks_proxy_port": {"default": None, "help": "SOCKS proxy port for browsertrix-crawler, use in combination with socks_proxy_host. eg 1234"},
        }
+    
+    def setup(self) -> None:
+        self.use_docker = os.environ.get('WACZ_ENABLE_DOCKER') or not os.environ.get('RUNNING_IN_DOCKER')
+        self.docker_in_docker = os.environ.get('WACZ_ENABLE_DOCKER') and os.environ.get('RUNNING_IN_DOCKER')
+
+        self.cwd_dind = f"/crawls/crawls{random_str(8)}"
+        self.browsertrix_home_host = os.environ.get('BROWSERTRIX_HOME_HOST')
+        self.browsertrix_home_container = os.environ.get('BROWSERTRIX_HOME_CONTAINER') or self.browsertrix_home_host
+        # create crawls folder if not exists, so it can be safely removed in cleanup
+        if self.docker_in_docker:
+            os.makedirs(self.cwd_dind, exist_ok=True)
+
+    def cleanup(self) -> None:
+        if self.docker_in_docker:
+            logger.debug(f"Removing {self.cwd_dind=}")
+            shutil.rmtree(self.cwd_dind, ignore_errors=True)

    def download(self, item: Metadata) -> Metadata:
        # this new Metadata object is required to avoid duplication
@@ -51,8 +67,8 @@ class WaczArchiverEnricher(Enricher, Archiver):
        url = to_enrich.get_url()

        collection = random_str(8)
-        browsertrix_home_host = os.environ.get('BROWSERTRIX_HOME_HOST') or os.path.abspath(ArchivingContext.get_tmp_dir())
-        browsertrix_home_container = os.environ.get('BROWSERTRIX_HOME_CONTAINER') or browsertrix_home_host
+        browsertrix_home_host = self.browsertrix_home_host or os.path.abspath(ArchivingContext.get_tmp_dir())
+        browsertrix_home_container = self.browsertrix_home_container or browsertrix_home_host

        cmd = [
            "crawl",
@@ -67,11 +83,12 @@ class WaczArchiverEnricher(Enricher, Archiver):
            "--behaviors", "autoscroll,autoplay,autofetch,siteSpecific",
            "--behaviorTimeout", str(self.timeout),
            "--timeout", str(self.timeout)]
+        
+        if self.docker_in_docker:
+            cmd.extend(["--cwd", self.cwd_dind])

        # call docker if explicitly enabled or we are running on the host (not in docker)
-        use_docker = os.environ.get('WACZ_ENABLE_DOCKER') or not os.environ.get('RUNNING_IN_DOCKER')
-
-        if use_docker:
+        if self.use_docker:
            logger.debug(f"generating WACZ in Docker for {url=}")
            logger.debug(f"{browsertrix_home_host=} {browsertrix_home_container=}")
            if self.docker_commands:
@@ -103,7 +120,10 @@ class WaczArchiverEnricher(Enricher, Archiver):
            logger.error(f"WACZ generation failed: {e}")
            return False

-        if use_docker:
+        
+        if self.docker_in_docker:
+            wacz_fn = os.path.join(self.cwd_dind, "collections", collection, f"{collection}.wacz")
+        elif self.use_docker:
            wacz_fn = os.path.join(browsertrix_home_container, "collections", collection, f"{collection}.wacz")
        else:
            wacz_fn = os.path.join("collections", collection, f"{collection}.wacz")
@@ -116,7 +136,9 @@ class WaczArchiverEnricher(Enricher, Archiver):
        if self.extract_media or self.extract_screenshot:
            self.extract_media_from_wacz(to_enrich, wacz_fn)

-        if use_docker:
+        if self.docker_in_docker:
+            jsonl_fn = os.path.join(self.cwd_dind, "collections", collection, "pages", "pages.jsonl")
+        elif self.use_docker:
            jsonl_fn = os.path.join(browsertrix_home_container, "collections", collection, "pages", "pages.jsonl")
        else:
            jsonl_fn = os.path.join("collections", collection, "pages", "pages.jsonl")
--- a/src/auto_archiver/formatters/html_formatter.py
+++ b/src/auto_archiver/formatters/html_formatter.py
@@ -21,7 +21,7 @@ class HtmlFormatter(Formatter):
    def __init__(self, config: dict) -> None:
        # without this STEP.__init__ is not called
        super().__init__(config)
-        self.environment = Environment(loader=FileSystemLoader(os.path.join(pathlib.Path(__file__).parent.resolve(), "templates/")))
+        self.environment = Environment(loader=FileSystemLoader(os.path.join(pathlib.Path(__file__).parent.resolve(), "templates/")), autoescape=True)
        # JinjaHelper class static methods are added as filters
        self.environment.filters.update({
            k: v.__func__ for k, v in JinjaHelpers.__dict__.items() if isinstance(v, staticmethod)
--- a/src/auto_archiver/version.py
+++ b/src/auto_archiver/version.py
@@ -1,6 +1,6 @@

 _MAJOR = "0"
-_MINOR = "9"
+_MINOR = "10"
 # On main and in a nightly release the patch should be one ahead of the last
 # released build.
 _PATCH = "1"
Author	SHA1	Message	Date
msramalho	f4827770e6	adds instagram no stories as success, and fix for telethon-based archivers.	2024-03-05 14:49:10 +00:00
msramalho	601572d76e	strip url	2024-02-29 11:54:01 +00:00
msramalho	d21e79a272	general security updates	2024-02-29 11:40:30 +00:00
msramalho	ccf5f857ef	adds configurable limits to instagram/youtube	2024-02-25 15:14:17 +00:00
msramalho	7de317d1b5	avoiding exception	2024-02-23 15:54:33 +00:00
msramalho	70075a1e5e	improving insta archiver	2024-02-23 15:37:28 +00:00
msramalho	5b9bc4919a	version bump	2024-02-23 14:08:23 +00:00
msramalho	f0158ffd9c	adds tagged posts and better parsing	2024-02-23 14:08:17 +00:00
msramalho	bfb35a43a9	adds more details from yt-dlp	2024-02-23 14:08:05 +00:00
msramalho	ef5b39c4f1	dind exception	2024-02-22 18:05:56 +00:00
msramalho	24ceafcb64	missing forward slash	2024-02-22 17:47:13 +00:00
msramalho	9fd4bb56a8	new attempt at dind wacz	2024-02-22 17:24:27 +00:00
msramalho	5324d562ba	cleanup wacz patch	2024-02-21 18:14:30 +00:00
msramalho	5bf0a0206d	version update	2024-02-21 17:26:07 +00:00
msramalho	4941823565	fix growing volume size in wacz_enricher	2024-02-21 17:25:55 +00:00
msramalho	27310c2911	fixes issue with api requests	2024-02-21 12:25:05 +00:00